diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000000000000000000000000000000000000..5e05fa914db14f49952a8405d5554d4b95b84dfd
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,44 @@
+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/01.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/02.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/03.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/04.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/05.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/06.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/07.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/08.mp4 filter=lfs diff=lfs merge=lfs -text
+assets/09.mp4 filter=lfs diff=lfs merge=lfs -text
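The rules above route every large binary format in the repo (archives, serialized model weights, NumPy arrays, and the demo clips under assets/) through Git LFS, so only small pointer files land in the git history. As a rough illustration, the sketch below approximates that matching from Python with fnmatch; the pattern subset and the fnmatch semantics are simplifications (gitattributes patterns such as saved_model/**/* follow their own rules), so treat it as an assumption-laden sketch rather than git's actual behavior.

```python
# Rough sketch: emulate the LFS-tracking decision for a path using a subset of
# the patterns above. fnmatch only approximates gitattributes matching.
from fnmatch import fnmatch

LFS_PATTERNS = ["*.bin", "*.ckpt", "*.npy", "*.pth", "*.safetensors", "*.zip", "assets/01.mp4"]

def is_lfs_tracked(path: str) -> bool:
    name = path.rsplit("/", 1)[-1]  # patterns without a slash match at any directory level
    return any(fnmatch(path, pat) or fnmatch(name, pat) for pat in LFS_PATTERNS)

print(is_lfs_tracked("pretrained_models/aios_checkpoint.pth"))  # True
print(is_lfs_tracked("app.py"))                                 # False
```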
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..1fee058a018d4062806512f1bac29dca6b96e876
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,4 @@
+data_ssc/
+demo_out/
+pretrained_models/*
+.vscode/
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cf5a2adf25531ebddfc4702c1fa68df337c1e2f0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,13 @@
+---
+title: AiOS
+emoji: ⚡
+colorFrom: blue
+colorTo: indigo
+sdk: gradio
+python_version: 3.9
+sdk_version: 4.38.1
+app_file: app.py
+pinned: false
+---
+
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
\ No newline at end of file
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..f3ebe114d4734c6e50ba3f8c9f725eb1ec2204f2
--- /dev/null
+++ b/app.py
@@ -0,0 +1,126 @@
+import os
+import sys
+import subprocess
+import pkg_resources
+
+def is_package_installed(package_name):
+    try:
+        pkg_resources.get_distribution(package_name)
+        return True
+    except pkg_resources.DistributionNotFound:
+        return False
+
+if is_package_installed("mmcv"):
+    print("MMCV is installed.")
+else:
+    print("MMCV is not installed. Building it from source.")
+    os.environ["MMCV_WITH_OPS"] = "1"
+    os.environ["FORCE_MLU"] = "1"
+    subprocess.run(["pip", "install", "-e", "./mmcv"], check=True)
+    subprocess.run(["pip", "list"], check=True)
+
+if is_package_installed("pytorch3d"):
+    print("pytorch3d is installed.")
+else:
+    print("pytorch3d is not installed. Building it from source.")
+    subprocess.run(["pip", "install", "-e", "./pytorch3d"], check=True)
+
+if is_package_installed("MultiScaleDeformableAttention"):
+    print("MultiScaleDeformableAttention is installed.")
+else:
+    print("MultiScaleDeformableAttention is not installed. Building it from source.")
+    subprocess.run(["pip", "install", "-e", "./models/aios/ops"], check=True)
+
+import os.path as osp
+from pathlib import Path
+import cv2
+import gradio as gr
+import torch
+import math
+import spaces
+from huggingface_hub import hf_hub_download
+
+hf_hub_download(repo_id="ttxskk/AiOS", filename="aios_checkpoint.pth", local_dir="/home/user/app/pretrained_models")
+
+OUT_FOLDER = '/home/user/app/demo_out'
+os.makedirs(OUT_FOLDER, exist_ok=True)
+
+DEMO_CONFIG = '/home/user/app/config/aios_smplx_demo.py'
+MODEL_PATH = '/home/user/app/pretrained_models/aios_checkpoint.pth'
+@spaces.GPU(enable_queue=True, duration=300)
+def infer(video_input, batch_size, threshold=0.5, num_person=1):
+    os.system(f'rm -rf {OUT_FOLDER}/*')
+    os.system(f'torchrun --nproc_per_node 1 \
+        main.py \
+        -c {DEMO_CONFIG} \
+        --options batch_size={batch_size} backbone="resnet50" num_person={num_person} threshold={threshold} \
+        --resume {MODEL_PATH} \
+        --eval \
+        --inference \
+        --inference_input {video_input} \
+        --to_vid \
+        --output_dir {OUT_FOLDER}')
+
+    video_path = os.path.join(OUT_FOLDER, 'demo_vid.mp4')
+    save_path_img = os.path.join(OUT_FOLDER, 'res_img')
+    save_path_mesh = os.path.join(OUT_FOLDER, 'mesh')
+    save_mesh_file = os.path.join(OUT_FOLDER, 'mesh.zip')
+    os.system(f'zip -r {save_mesh_file} {save_path_mesh}')
+    yield video_path, save_mesh_file
+
+TITLE = """
+# AiOS: All-in-One-Stage Expressive Human Pose and Shape Estimation
+
+Recover expressive human pose and shape for multiple people from an RGB image without any additional requirements, such as an off-the-shelf detection model.
+"""
+with gr.Blocks(title="AiOS", theme=gr.themes.Soft(primary_hue="blue", secondary_hue="gray")) as demo:
+
+    gr.Markdown(TITLE)
+    with gr.Row():
+        with gr.Column(scale=2):
+            video_input = gr.Video(label="Input video", elem_classes="video")
+        with gr.Column(scale=1):
+            batch_size = gr.Textbox(label="Batch Size", type="text", value=8)
+            num_person = gr.Textbox(label="Number of Persons", type="text", value=1)
+            threshold = gr.Slider(0, 1.0, value=0.5, label='Score Threshold')
+            send_button = gr.Button("Infer")
+            gr.HTML("""
+            """)
+
+    with gr.Row():
+        with gr.Column():
+            # processed_frames = gr.Image(label="Last processed frame")
+            video_output = gr.Video(elem_classes="video")
+        with gr.Column():
+            meshes_output = gr.File(label="3D meshes")
+
+    send_button.click(fn=infer, inputs=[video_input, batch_size, threshold, num_person], outputs=[video_output, meshes_output])
+    # example_videos = gr.Examples([
+    #     ['./assets/01.mp4'],
+    #     ['./assets/02.mp4'],
+    #     ['./assets/03.mp4'],
+    #     ['./assets/04.mp4'],
+    #     ['./assets/05.mp4'],
+    #     ['./assets/06.mp4'],
+    #     ['./assets/07.mp4'],
+    #     ['./assets/08.mp4'],
+    #     ['./assets/09.mp4'],
+    #     ],
+    #     inputs=[video_input, 0.5])
+
+demo.queue().launch(debug=True)
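app.py's infer handler shells out to torchrun through os.system, interpolating the Gradio widget values straight into the command string. For readers who prefer an argument-list form, here is a hedged sketch of the same invocation via subprocess.run; the flag names are copied from the os.system call above, while the helper name run_inference and the switch to subprocess are illustrative choices, not what the app actually does.

```python
# Sketch only: the torchrun call from infer(), rebuilt as an argument list
# (no shell). Flags mirror the os.system string in app.py above.
import subprocess

def run_inference(video_input, batch_size, threshold, num_person, config, ckpt, out_dir):
    cmd = [
        "torchrun", "--nproc_per_node", "1", "main.py",
        "-c", config,
        "--options", f"batch_size={batch_size}", "backbone=resnet50",  # shell-quoted as backbone="resnet50" above
        f"num_person={num_person}", f"threshold={threshold}",
        "--resume", ckpt,
        "--eval", "--inference",
        "--inference_input", video_input,
        "--to_vid",
        "--output_dir", out_dir,
    ]
    subprocess.run(cmd, check=True)  # raises CalledProcessError if torchrun fails

# e.g. run_inference(video_path, 8, 0.5, 1, DEMO_CONFIG, MODEL_PATH, OUT_FOLDER)
```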
diff --git a/assets/01.mp4 b/assets/01.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..0a8e2831c3461c62dc6afa241addb829154bd812
--- /dev/null
+++ b/assets/01.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2ba560996c248d78be6556f1727ae6ced81cd62a002715c3ffd542f6202b204b
+size 2751935
diff --git a/assets/02.mp4 b/assets/02.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..1d418b5be50c00473f988180f7e4b07a8904f666
--- /dev/null
+++ b/assets/02.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:00702a08c978b27b3ddf6ddfd48c5a057753664c8e80d83f4b4e04dff45b8a71
+size 2827267
diff --git a/assets/03.mp4 b/assets/03.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..952e5c5b1e896ba31934ffbb5af03d93304f371f
--- /dev/null
+++ b/assets/03.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bfcc1ce90a0921ffa5550a04f743470081ff4599c265cf491e636a8ea70233d4
+size 4033767
diff --git a/assets/04.mp4 b/assets/04.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..1e95ea77e8571587b68657ce832f33b538ce2dd7
--- /dev/null
+++ b/assets/04.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:28531c3c0ad9cbcc097a00f8553aafcdc0513a881f0fa6d1a7937248f46fce0c
+size 2639842
diff --git a/assets/05.mp4 b/assets/05.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..e7d9d8f6ed14f0b8ddb5131840d02f8e8152195f
--- /dev/null
+++ b/assets/05.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cf7f1b65d87f0a77c1d9456771e4f88228aa836426b4ad0cbad672e80d07e36
+size 3584040
diff --git a/assets/06.mp4 b/assets/06.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..dfb9d5a3279c180e8a4f03803fb42d966e451e80
--- /dev/null
+++ b/assets/06.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fcb4139d4863c5ec92224f7cb452ec4631be0613eb4c3f82ee7fbb6f89510fe2
+size 19797950
diff --git a/assets/07.mp4 b/assets/07.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..9d4b83e851a6040a7a864a8a546c60e20cfe3964
--- /dev/null
+++ b/assets/07.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4c71c5ed8573cb727c515d733e51c5da4654c58ab096cbca4bdf9b072e8284c7
+size 3274979
diff --git a/assets/08.mp4 b/assets/08.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..0442a9995c9ef1efe83ac64cad26ec7a0d93ef29
--- /dev/null
+++ b/assets/08.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d14f03e984a0ebefd9e8429c8e0d3ecdb0ffc9126ad91a489b57dc0f5d12695b
+size 6825913
diff --git a/assets/09.mp4 b/assets/09.mp4
new file mode 100644
index 0000000000000000000000000000000000000000..763be21a7cc25fda8bf2ae924843d58ff9922b56
--- /dev/null
+++ b/assets/09.mp4
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:30b5b6f75f024647a9e430f02b33caa1ccec327b487ba5bb451e2859e1e45142
+size 6336699
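Each assets/*.mp4 entry above is committed as a Git LFS pointer: three text lines giving the spec version, the SHA-256 object id, and the payload size, while the video bytes themselves live in LFS storage. A small parser for exactly that three-line format (the helper name is illustrative):

```python
# Parse the three-line Git LFS pointer format shown above:
#   version https://git-lfs.github.com/spec/v1
#   oid sha256:<hex digest>
#   size <bytes>
def parse_lfs_pointer(path: str) -> dict:
    fields = {}
    with open(path, "r") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return {
        "version": fields["version"],
        "sha256": fields["oid"].split(":", 1)[1],
        "size_bytes": int(fields["size"]),
    }

# On the assets/01.mp4 pointer this yields sha256 2ba5...204b and size_bytes 2751935.
```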
diff --git a/config/__init__.py b/config/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/config/aios_smplx.py b/config/aios_smplx.py
new file mode 100644
index 0000000000000000000000000000000000000000..51192ad8fa0e096446bd106ac00465737012bc98
--- /dev/null
+++ b/config/aios_smplx.py
@@ -0,0 +1,259 @@
+
+num_classes = 2
+lr = 0.0001*1.414/10
+param_dict_type = 'default'
+lr_backbone = 1e-05*1.414/10
+lr_backbone_names = ['backbone.0']
+lr_linear_proj_names = ['reference_points', 'sampling_offsets']
+lr_linear_proj_mult = 0.1
+ddetr_lr_param = False
+batch_size = 2
+weight_decay = 0.0001
+epochs = 200
+lr_drop = 11
+save_checkpoint_interval = 1
+clip_max_norm = 0.1
+onecyclelr = False
+multi_step_lr = True
+lr_drop_list = [30, 60]
+
+modelname = 'aios_smplx'
+frozen_weights = None
+backbone = 'resnet50'
+use_checkpoint = False
+
+dilation = False
+position_embedding = 'sine'
+pe_temperatureH = 20
+pe_temperatureW = 20
+return_interm_indices = [1, 2, 3]
+backbone_freeze_keywords = None
+enc_layers = 6
+dec_layers = 6
+pre_norm = False
+dim_feedforward = 2048
+hidden_dim = 256
+dropout = 0.0
+nheads = 8
+num_queries = 900
+query_dim = 4
+num_patterns = 0
+random_refpoints_xy = False
+fix_refpoints_hw = -1
+dec_layer_number = None
+num_feature_levels = 4
+enc_n_points = 4
+dec_n_points = 4
+dln_xy_noise = 0.2
+dln_hw_noise = 0.2
+two_stage_type = 'standard'
+two_stage_bbox_embed_share = False
+two_stage_class_embed_share = False
+two_stage_learn_wh = False
+two_stage_default_hw = 0.05
+two_stage_keep_all_tokens = False
+rm_detach = None
+num_select = 50
+transformer_activation = 'relu'
+batch_norm_type = 'FrozenBatchNorm2d'
+
+masks = False
+losses = ["smpl_pose", "smpl_beta", "smpl_expr",
+ "smpl_kp2d","smpl_kp3d","smpl_kp3d_ra",'labels', 'boxes', "keypoints"]
+# losses = ['labels', 'boxes', "keypoints"]
+aux_loss = True
+set_cost_class = 2.0
+set_cost_bbox = 5.0
+set_cost_giou = 2.0
+set_cost_keypoints = 10.0
+set_cost_kpvis = 0.0
+set_cost_oks = 4.0
+cls_loss_coef = 2.0
+# keypoints_loss_coef = 10.0
+
+smpl_pose_loss_root_coef = 10 * 0.1
+smpl_pose_loss_body_coef = 1 * 0.1
+smpl_pose_loss_lhand_coef = 1 * 0.1
+smpl_pose_loss_rhand_coef = 1 * 0.1
+smpl_pose_loss_jaw_coef = 1 * 0.1
+smpl_beta_loss_coef = 0.01
+smpl_expr_loss_coef = 0.01
+
+# smpl_kp3d_loss_coef = 10
+smpl_body_kp3d_loss_coef = 10.0 * 0.1
+smpl_face_kp3d_loss_coef = 1.0 * 0.1
+smpl_lhand_kp3d_loss_coef = 1 * 0.1
+smpl_rhand_kp3d_loss_coef = 1 * 0.1
+
+# kp3d ra
+smpl_body_kp3d_ra_loss_coef = 10 * 0.1
+smpl_face_kp3d_ra_loss_coef = 1 * 0.1
+smpl_lhand_kp3d_ra_loss_coef = 1 * 0.1
+smpl_rhand_kp3d_ra_loss_coef = 1 * 0.1
+
+
+# smpl_kp2d_ba_loss_coef = 1.0
+smpl_body_kp2d_loss_coef = 10.0 * 0.1
+smpl_lhand_kp2d_loss_coef = 5.0 * 0.1
+smpl_rhand_kp2d_loss_coef = 5.0 * 0.1
+smpl_face_kp2d_loss_coef = 1.0 * 0.1
+
+smpl_body_kp2d_ba_loss_coef = 0 * 0.1
+smpl_face_kp2d_ba_loss_coef = 0 * 0.1
+smpl_lhand_kp2d_ba_loss_coef = 0 * 0.1
+smpl_rhand_kp2d_ba_loss_coef = 0 * 0.1
+
+bbox_loss_coef = 5.0
+body_bbox_loss_coef = 5.0
+lhand_bbox_loss_coef = 5.0
+rhand_bbox_loss_coef = 5.0
+face_bbox_loss_coef = 5.0
+
+giou_loss_coef = 2.0
+body_giou_loss_coef = 2.0
+rhand_giou_loss_coef = 2.0
+lhand_giou_loss_coef = 2.0
+face_giou_loss_coef = 2.0
+
+keypoints_loss_coef = 10.0
+rhand_keypoints_loss_coef = 10.0
+lhand_keypoints_loss_coef = 10.0
+face_keypoints_loss_coef = 10.0
+
+oks_loss_coef=4.0
+rhand_oks_loss_coef = 0.5
+lhand_oks_loss_coef = 0.5
+face_oks_loss_coef = 4.0
+
+
+enc_loss_coef = 1.0
+interm_loss_coef = 1.0
+no_interm_box_loss = False
+focal_alpha = 0.25
+rm_self_attn_layers = None
+indices_idx_list = [1, 2, 3, 4, 5, 6, 7]
+
+decoder_sa_type = 'sa'
+matcher_type = 'HungarianMatcher'
+decoder_module_seq = ['sa', 'ca', 'ffn']
+nms_iou_threshold = -1
+
+dec_pred_bbox_embed_share = False
+dec_pred_class_embed_share = False
+dec_pred_pose_embed_share = False
+body_only = True
+
+# for dn
+use_dn = True
+dn_number = 100
+dn_box_noise_scale = 0.4
+dn_label_noise_ratio = 0.5
+embed_init_tgt = False
+dn_label_coef = 0.3
+dn_bbox_coef = 0.5
+dn_batch_gt_fuse = False
+dn_attn_mask_type_list = ['match2dn', 'dn2dn', 'group2group']
+dn_labelbook_size = 100
+
+match_unstable_error = False
+
+# for ema
+use_ema = True
+ema_decay = 0.9997
+ema_epoch = 0
+
+cls_no_bias = False
+num_body_points = 17 # for coco
+num_hand_points = 6 # for coco
+num_face_points = 6 # for coco
+num_group = 100
+num_box_decoder_layers = 2
+num_hand_face_decoder_layers = 4
+no_mmpose_keypoint_evaluator = True
+strong_aug = False
+
+body_model_test=\
+ dict(
+ type='smplx',
+ keypoint_src='smplx',
+ num_expression_coeffs=10,
+ num_betas=10,
+ keypoint_dst='smplx_137',
+ model_path='data/body_models/smplx',
+ use_pca=False,
+ use_face_contour=True)
+
+body_model_train = \
+ dict(
+ type='smplx',
+ keypoint_src='smplx',
+ num_expression_coeffs=10,
+ num_betas=10,
+ keypoint_dst='smplx_137',
+ model_path='data/body_models/smplx',
+ use_pca=False,
+ use_face_contour=True)
+
+# will be updated in exp
+exp_name = 'output/exp52/dataset_debug'
+
+
+end_epoch = 150
+train_batch_size = 32
+
+scheduler = 'step'
+step_size = 20
+gamma = 0.1
+
+# continue
+continue_train = True
+pretrained_model_path = '../output/train_gta_synbody_ft_20230410_132110/model_dump/snapshot_2.pth.tar'
+
+# dataset setting
+# dataset_list = ['AGORA_MM','BEDLAM', 'COCO_NA']
+# trainset_3d = ['AGORA_MM','BEDLAM', 'COCO_NA']
+dataset_list = ['INFERENCE_demo']
+trainset_3d = []
+trainset_2d = []
+trainset_partition = {
+ }
+trainset_humandata = []
+testset = 'INFERENCE_demo'
+train_sizes=[480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
+train_max_size=1333
+test_sizes=[800]
+test_max_size=1333
+no_aug=False
+# model
+use_cache = True
+
+## UBody setting
+train_sample_interval = 10
+test_sample_interval = 100
+make_same_len = False
+
+## input, output size
+input_body_shape = (256, 192)
+output_hm_shape = (16, 16, 12)
+input_hand_shape = (256, 256)
+output_hand_hm_shape = (16, 16, 16)
+output_face_hm_shape = (8, 8, 8)
+input_face_shape = (192, 192)
+focal = (5000, 5000) # virtual focal lengths
+princpt = (input_body_shape[1] / 2, input_body_shape[0] / 2
+ ) # virtual principal point position
+body_3d_size = 2
+hand_3d_size = 0.3
+face_3d_size = 0.3
+camera_3d_size = 2.5
+
+bbox_ratio = 1.2
+
+## directory
+output_dir, model_dir, vis_dir, log_dir, result_dir, code_dir = None, None, None, None, None, None
+
+agora_benchmark = 'na' # 'agora_model', 'test_only'
+
+# strategy
+data_strategy = 'balance'  # 'balance' needs total_data_len to be defined
+total_data_len = 'auto'
\ No newline at end of file
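config/aios_smplx.py and the variants that follow are flat mmcv-style Python configs: every module-level assignment becomes a field of the loaded config object, which is how config/config.py later ingests them through MMConfig._file2dict and merge_from_dict. A minimal sketch of reading a few of these fields directly with mmcv's Config (assuming mmcv is installed, as the app builds it from source):

```python
# Minimal sketch: load the flat config above with mmcv and read some fields.
from mmcv import Config

cfg = Config.fromfile("config/aios_smplx.py")
print(cfg.num_queries, cfg.backbone)        # 900 resnet50
print(cfg.input_body_shape, cfg.princpt)    # (256, 192) (96.0, 128.0)
print(cfg.body_model_test["keypoint_dst"])  # smplx_137
```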
diff --git a/config/aios_smplx_agora_val.py b/config/aios_smplx_agora_val.py
new file mode 100644
index 0000000000000000000000000000000000000000..d85b8a081435723cddd741b703d68e5c6bb80ef6
--- /dev/null
+++ b/config/aios_smplx_agora_val.py
@@ -0,0 +1,265 @@
+
+num_classes = 2
+lr = 1e-04
+param_dict_type = 'default'
+lr_backbone = 1e-05
+lr_backbone_names = ['backbone.0']
+lr_linear_proj_names = ['reference_points', 'sampling_offsets']
+lr_linear_proj_mult = 0.1
+ddetr_lr_param = False
+batch_size = 2
+weight_decay = 0.0001
+epochs = 200
+lr_drop = 11
+save_checkpoint_interval = 1
+clip_max_norm = 0.1
+onecyclelr = False
+multi_step_lr = True
+lr_drop_list = [30, 60]
+
+modelname = 'aios_smplx'
+frozen_weights = None
+backbone = 'resnet50'
+use_checkpoint = False
+
+dilation = False
+position_embedding = 'sine'
+pe_temperatureH = 20
+pe_temperatureW = 20
+return_interm_indices = [1, 2, 3]
+backbone_freeze_keywords = None
+enc_layers = 6
+dec_layers = 6
+pre_norm = False
+dim_feedforward = 2048
+hidden_dim = 256
+dropout = 0.0
+nheads = 8
+num_queries = 900
+query_dim = 4
+num_patterns = 0
+random_refpoints_xy = False
+fix_refpoints_hw = -1
+dec_layer_number = None
+num_feature_levels = 4
+enc_n_points = 4
+dec_n_points = 4
+dln_xy_noise = 0.2
+dln_hw_noise = 0.2
+two_stage_type = 'standard'
+two_stage_bbox_embed_share = False
+two_stage_class_embed_share = False
+two_stage_learn_wh = False
+two_stage_default_hw = 0.05
+two_stage_keep_all_tokens = False
+rm_detach = None
+num_select = 50
+transformer_activation = 'relu'
+batch_norm_type = 'FrozenBatchNorm2d'
+
+masks = False
+losses = ["smpl_pose", "smpl_beta", "smpl_expr",
+ "smpl_kp2d","smpl_kp3d","smpl_kp3d_ra",'labels', 'boxes', "keypoints"]
+# losses = ['labels', 'boxes', "keypoints"]
+aux_loss = True
+set_cost_class = 2.0
+set_cost_bbox = 5.0
+set_cost_giou = 2.0
+set_cost_keypoints = 10.0
+set_cost_kpvis = 0.0
+set_cost_oks = 4.0
+cls_loss_coef = 2.0
+# keypoints_loss_coef = 10.0
+
+smpl_pose_loss_root_coef = 10 * 0.1
+smpl_pose_loss_body_coef = 1 * 0.1
+smpl_pose_loss_lhand_coef = 1 * 0.1
+smpl_pose_loss_rhand_coef = 1 * 0.1
+smpl_pose_loss_jaw_coef = 1 * 0.1
+smpl_beta_loss_coef = 0.01
+smpl_expr_loss_coef = 0.01
+
+# smpl_kp3d_loss_coef = 10
+smpl_body_kp3d_loss_coef = 10.0 * 0.1
+smpl_face_kp3d_loss_coef = 1.0 * 0.1
+smpl_lhand_kp3d_loss_coef = 1 * 0.1
+smpl_rhand_kp3d_loss_coef = 1 * 0.1
+
+# kp3d ra
+smpl_body_kp3d_ra_loss_coef = 10 * 0.1
+smpl_face_kp3d_ra_loss_coef = 1 * 0.1
+smpl_lhand_kp3d_ra_loss_coef = 1 * 0.1
+smpl_rhand_kp3d_ra_loss_coef = 1 * 0.1
+
+
+# smpl_kp2d_ba_loss_coef = 1.0
+smpl_body_kp2d_loss_coef = 10.0 * 0.1
+smpl_lhand_kp2d_loss_coef = 5.0 * 0.1
+smpl_rhand_kp2d_loss_coef = 5.0 * 0.1
+smpl_face_kp2d_loss_coef = 1.0 * 0.1
+
+smpl_body_kp2d_ba_loss_coef = 0 * 0.1
+smpl_face_kp2d_ba_loss_coef = 0 * 0.1
+smpl_lhand_kp2d_ba_loss_coef = 0 * 0.1
+smpl_rhand_kp2d_ba_loss_coef = 0 * 0.1
+
+bbox_loss_coef = 5.0
+body_bbox_loss_coef = 5.0
+lhand_bbox_loss_coef = 5.0
+rhand_bbox_loss_coef = 5.0
+face_bbox_loss_coef = 5.0
+
+giou_loss_coef = 2.0
+body_giou_loss_coef = 2.0
+rhand_giou_loss_coef = 2.0
+lhand_giou_loss_coef = 2.0
+face_giou_loss_coef = 2.0
+
+keypoints_loss_coef = 10.0
+rhand_keypoints_loss_coef = 10.0
+lhand_keypoints_loss_coef = 10.0
+face_keypoints_loss_coef = 10.0
+
+oks_loss_coef=4.0
+rhand_oks_loss_coef = 0.5
+lhand_oks_loss_coef = 0.5
+face_oks_loss_coef = 4.0
+
+
+enc_loss_coef = 1.0
+interm_loss_coef = 1.0
+no_interm_box_loss = False
+focal_alpha = 0.25
+rm_self_attn_layers = None
+indices_idx_list = [1, 2, 3, 4, 5, 6, 7]
+
+decoder_sa_type = 'sa'
+matcher_type = 'HungarianMatcher'
+decoder_module_seq = ['sa', 'ca', 'ffn']
+nms_iou_threshold = -1
+
+dec_pred_bbox_embed_share = False
+dec_pred_class_embed_share = False
+dec_pred_pose_embed_share = False
+body_only = True
+
+# for dn
+use_dn = True
+dn_number = 100
+dn_box_noise_scale = 0.4
+dn_label_noise_ratio = 0.5
+embed_init_tgt = False
+dn_label_coef = 0.3
+dn_bbox_coef = 0.5
+dn_batch_gt_fuse = False
+dn_attn_mask_type_list = ['match2dn', 'dn2dn', 'group2group']
+dn_labelbook_size = 100
+
+match_unstable_error = False
+
+# for ema
+use_ema = True
+ema_decay = 0.9997
+ema_epoch = 0
+
+cls_no_bias = False
+num_body_points = 17 # for coco
+num_hand_points = 6 # for coco
+num_face_points = 6 # for coco
+num_group = 100
+num_box_decoder_layers = 2
+num_hand_face_decoder_layers = 4
+no_mmpose_keypoint_evaluator = True
+strong_aug = False
+
+body_model_test=\
+ dict(
+ type='smplx',
+ keypoint_src='smplx',
+ num_expression_coeffs=10,
+ num_betas=10,
+ keypoint_dst='smplx_137',
+ model_path='data/body_models/smplx',
+ use_pca=False,
+ use_face_contour=True)
+
+body_model_train = \
+ dict(
+ type='smplx',
+ keypoint_src='smplx',
+ num_expression_coeffs=10,
+ num_betas=10,
+ keypoint_dst='smplx_137',
+ model_path='data/body_models/smplx',
+ use_pca=False,
+ use_face_contour=True)
+
+# will be updated in exp
+exp_name = 'output/exp52/dataset_debug'
+
+
+end_epoch = 150
+train_batch_size = 32
+
+scheduler = 'step'
+step_size = 20
+gamma = 0.1
+
+# continue
+continue_train = True
+pretrained_model_path = '../output/train_gta_synbody_ft_20230410_132110/model_dump/snapshot_2.pth.tar'
+
+# dataset setting
+# dataset_list = ['AGORA_MM','BEDLAM', 'COCO_NA']
+# trainset_3d = ['AGORA_MM','BEDLAM', 'COCO_NA']
+dataset_list = ['AGORA_MM','BEDLAM', 'COCO_NA']
+trainset_3d = ['AGORA_MM','BEDLAM', 'COCO_NA']
+trainset_2d = []
+trainset_partition = {
+ 'AGORA_MM': 0.4,
+ 'BEDLAM': 0.7,
+ 'COCO_NA': 1,
+
+ # 'EgoBody_Egocentric': 1,
+ # 'EgoBody_Kinect': 1.0,
+ }
+trainset_humandata = []
+testset = 'INFERENCE_AGORA'
+train_sizes=[480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
+train_max_size=1333
+test_sizes=[800]
+test_max_size=1333
+no_aug=False
+# model
+use_cache = True
+
+## UBody setting
+train_sample_interval = 10
+test_sample_interval = 100
+make_same_len = False
+
+## input, output size
+input_body_shape = (256, 192)
+output_hm_shape = (16, 16, 12)
+input_hand_shape = (256, 256)
+output_hand_hm_shape = (16, 16, 16)
+output_face_hm_shape = (8, 8, 8)
+input_face_shape = (192, 192)
+focal = (5000, 5000) # virtual focal lengths
+princpt = (input_body_shape[1] / 2, input_body_shape[0] / 2
+ ) # virtual principal point position
+body_3d_size = 2
+hand_3d_size = 0.3
+face_3d_size = 0.3
+camera_3d_size = 2.5
+
+bbox_ratio = 1.2
+
+## directory
+output_dir, model_dir, vis_dir, log_dir, result_dir, code_dir = None, None, None, None, None, None
+
+agora_benchmark = 'na' # 'agora_model', 'test_only'
+
+# strategy
+data_strategy = 'balance'  # 'balance' needs total_data_len to be defined
+total_data_len = 'auto'
\ No newline at end of file
diff --git a/config/aios_smplx_bedlam.py b/config/aios_smplx_bedlam.py
new file mode 100644
index 0000000000000000000000000000000000000000..88eeb29f74eceb0404269ffa56c628afa068fcff
--- /dev/null
+++ b/config/aios_smplx_bedlam.py
@@ -0,0 +1,265 @@
+
+num_classes = 2
+lr = 0.0001*1.414/10
+param_dict_type = 'default'
+lr_backbone = 1e-05*1.414/10
+lr_backbone_names = ['backbone.0']
+lr_linear_proj_names = ['reference_points', 'sampling_offsets']
+lr_linear_proj_mult = 0.1
+ddetr_lr_param = False
+batch_size = 2
+weight_decay = 0.0001
+epochs = 200
+lr_drop = 11
+save_checkpoint_interval = 1
+clip_max_norm = 0.1
+onecyclelr = False
+multi_step_lr = True
+lr_drop_list = [30, 60]
+
+modelname = 'aios_smplx'
+frozen_weights = None
+backbone = 'resnet50'
+use_checkpoint = False
+
+dilation = False
+position_embedding = 'sine'
+pe_temperatureH = 20
+pe_temperatureW = 20
+return_interm_indices = [1, 2, 3]
+backbone_freeze_keywords = None
+enc_layers = 6
+dec_layers = 6
+pre_norm = False
+dim_feedforward = 2048
+hidden_dim = 256
+dropout = 0.0
+nheads = 8
+num_queries = 900
+query_dim = 4
+num_patterns = 0
+random_refpoints_xy = False
+fix_refpoints_hw = -1
+dec_layer_number = None
+num_feature_levels = 4
+enc_n_points = 4
+dec_n_points = 4
+dln_xy_noise = 0.2
+dln_hw_noise = 0.2
+two_stage_type = 'standard'
+two_stage_bbox_embed_share = False
+two_stage_class_embed_share = False
+two_stage_learn_wh = False
+two_stage_default_hw = 0.05
+two_stage_keep_all_tokens = False
+rm_detach = None
+num_select = 50
+transformer_activation = 'relu'
+batch_norm_type = 'FrozenBatchNorm2d'
+
+masks = False
+losses = ["smpl_pose", "smpl_beta", "smpl_expr",
+ "smpl_kp2d","smpl_kp3d","smpl_kp3d_ra",'labels', 'boxes', "keypoints"]
+# losses = ['labels', 'boxes', "keypoints"]
+aux_loss = True
+set_cost_class = 2.0
+set_cost_bbox = 5.0
+set_cost_giou = 2.0
+set_cost_keypoints = 10.0
+set_cost_kpvis = 0.0
+set_cost_oks = 4.0
+cls_loss_coef = 2.0
+# keypoints_loss_coef = 10.0
+
+smpl_pose_loss_root_coef = 10 * 0.1
+smpl_pose_loss_body_coef = 1 * 0.1
+smpl_pose_loss_lhand_coef = 1 * 0.1
+smpl_pose_loss_rhand_coef = 1 * 0.1
+smpl_pose_loss_jaw_coef = 1 * 0.1
+smpl_beta_loss_coef = 0.01
+smpl_expr_loss_coef = 0.01
+
+# smpl_kp3d_loss_coef = 10
+smpl_body_kp3d_loss_coef = 10.0 * 0.1
+smpl_face_kp3d_loss_coef = 1.0 * 0.1
+smpl_lhand_kp3d_loss_coef = 1 * 0.1
+smpl_rhand_kp3d_loss_coef = 1 * 0.1
+
+# kp3d ra
+smpl_body_kp3d_ra_loss_coef = 10 * 0.1
+smpl_face_kp3d_ra_loss_coef = 1 * 0.1
+smpl_lhand_kp3d_ra_loss_coef = 1 * 0.1
+smpl_rhand_kp3d_ra_loss_coef = 1 * 0.1
+
+
+# smpl_kp2d_ba_loss_coef = 1.0
+smpl_body_kp2d_loss_coef = 10.0 * 0.1
+smpl_lhand_kp2d_loss_coef = 5.0 * 0.1
+smpl_rhand_kp2d_loss_coef = 5.0 * 0.1
+smpl_face_kp2d_loss_coef = 1.0 * 0.1
+
+smpl_body_kp2d_ba_loss_coef = 0 * 0.1
+smpl_face_kp2d_ba_loss_coef = 0 * 0.1
+smpl_lhand_kp2d_ba_loss_coef = 0 * 0.1
+smpl_rhand_kp2d_ba_loss_coef = 0 * 0.1
+
+bbox_loss_coef = 5.0
+body_bbox_loss_coef = 5.0
+lhand_bbox_loss_coef = 5.0
+rhand_bbox_loss_coef = 5.0
+face_bbox_loss_coef = 5.0
+
+giou_loss_coef = 2.0
+body_giou_loss_coef = 2.0
+rhand_giou_loss_coef = 2.0
+lhand_giou_loss_coef = 2.0
+face_giou_loss_coef = 2.0
+
+keypoints_loss_coef = 10.0
+rhand_keypoints_loss_coef = 10.0
+lhand_keypoints_loss_coef = 10.0
+face_keypoints_loss_coef = 10.0
+
+oks_loss_coef=4.0
+rhand_oks_loss_coef = 0.5
+lhand_oks_loss_coef = 0.5
+face_oks_loss_coef = 4.0
+
+
+enc_loss_coef = 1.0
+interm_loss_coef = 1.0
+no_interm_box_loss = False
+focal_alpha = 0.25
+rm_self_attn_layers = None
+indices_idx_list = [1, 2, 3, 4, 5, 6, 7]
+
+decoder_sa_type = 'sa'
+matcher_type = 'HungarianMatcher'
+decoder_module_seq = ['sa', 'ca', 'ffn']
+nms_iou_threshold = -1
+
+dec_pred_bbox_embed_share = False
+dec_pred_class_embed_share = False
+dec_pred_pose_embed_share = False
+body_only = True
+
+# for dn
+use_dn = True
+dn_number = 100
+dn_box_noise_scale = 0.4
+dn_label_noise_ratio = 0.5
+embed_init_tgt = False
+dn_label_coef = 0.3
+dn_bbox_coef = 0.5
+dn_batch_gt_fuse = False
+dn_attn_mask_type_list = ['match2dn', 'dn2dn', 'group2group']
+dn_labelbook_size = 100
+
+match_unstable_error = False
+
+# for ema
+use_ema = True
+ema_decay = 0.9997
+ema_epoch = 0
+
+cls_no_bias = False
+num_body_points = 17 # for coco
+num_hand_points = 6 # for coco
+num_face_points = 6 # for coco
+num_group = 100
+num_box_decoder_layers = 2
+num_hand_face_decoder_layers = 4
+no_mmpose_keypoint_evaluator = True
+strong_aug = False
+
+body_model_test=\
+ dict(
+ type='smplx',
+ keypoint_src='smplx',
+ num_expression_coeffs=10,
+ num_betas=10,
+ keypoint_dst='smplx_137',
+ model_path='data/body_models/smplx',
+ use_pca=False,
+ use_face_contour=True)
+
+body_model_train = \
+ dict(
+ type='smplx',
+ keypoint_src='smplx',
+ num_expression_coeffs=10,
+ num_betas=10,
+ keypoint_dst='smplx_137',
+ model_path='data/body_models/smplx',
+ use_pca=False,
+ use_face_contour=True)
+
+# will be updated in exp
+exp_name = 'output/exp52/dataset_debug'
+
+
+end_epoch = 150
+train_batch_size = 32
+
+scheduler = 'step'
+step_size = 20
+gamma = 0.1
+
+# continue
+continue_train = True
+pretrained_model_path = '../output/train_gta_synbody_ft_20230410_132110/model_dump/snapshot_2.pth.tar'
+
+# dataset setting
+# dataset_list = ['AGORA_MM','BEDLAM', 'COCO_NA']
+# trainset_3d = ['AGORA_MM','BEDLAM', 'COCO_NA']
+dataset_list = ['AGORA_MM','BEDLAM', 'COCO_NA']
+trainset_3d = ['AGORA_MM','BEDLAM', 'COCO_NA']
+trainset_2d = []
+trainset_partition = {
+ 'AGORA_MM': 0.4,
+ 'BEDLAM': 0.7,
+ 'COCO_NA': 1,
+
+ # 'EgoBody_Egocentric': 1,
+ # 'EgoBody_Kinect': 1.0,
+ }
+trainset_humandata = []
+testset = 'INFERENCE_BEDLAM'
+train_sizes=[480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
+train_max_size=1333
+test_sizes=[800]
+test_max_size=1333
+no_aug=False
+# model
+use_cache = True
+
+## UBody setting
+train_sample_interval = 10
+test_sample_interval = 100
+make_same_len = False
+
+## input, output size
+input_body_shape = (256, 192)
+output_hm_shape = (16, 16, 12)
+input_hand_shape = (256, 256)
+output_hand_hm_shape = (16, 16, 16)
+output_face_hm_shape = (8, 8, 8)
+input_face_shape = (192, 192)
+focal = (5000, 5000) # virtual focal lengths
+princpt = (input_body_shape[1] / 2, input_body_shape[0] / 2
+ ) # virtual principal point position
+body_3d_size = 2
+hand_3d_size = 0.3
+face_3d_size = 0.3
+camera_3d_size = 2.5
+
+bbox_ratio = 1.2
+
+## directory
+output_dir, model_dir, vis_dir, log_dir, result_dir, code_dir = None, None, None, None, None, None
+
+agora_benchmark = 'na' # 'agora_model', 'test_only'
+
+# strategy
+data_strategy = 'balance'  # 'balance' needs total_data_len to be defined
+total_data_len = 'auto'
\ No newline at end of file
diff --git a/config/aios_smplx_demo.py b/config/aios_smplx_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..51192ad8fa0e096446bd106ac00465737012bc98
--- /dev/null
+++ b/config/aios_smplx_demo.py
@@ -0,0 +1,259 @@
+
+num_classes = 2
+lr = 0.0001*1.414/10
+param_dict_type = 'default'
+lr_backbone = 1e-05*1.414/10
+lr_backbone_names = ['backbone.0']
+lr_linear_proj_names = ['reference_points', 'sampling_offsets']
+lr_linear_proj_mult = 0.1
+ddetr_lr_param = False
+batch_size = 2
+weight_decay = 0.0001
+epochs = 200
+lr_drop = 11
+save_checkpoint_interval = 1
+clip_max_norm = 0.1
+onecyclelr = False
+multi_step_lr = True
+lr_drop_list = [30, 60]
+
+modelname = 'aios_smplx'
+frozen_weights = None
+backbone = 'resnet50'
+use_checkpoint = False
+
+dilation = False
+position_embedding = 'sine'
+pe_temperatureH = 20
+pe_temperatureW = 20
+return_interm_indices = [1, 2, 3]
+backbone_freeze_keywords = None
+enc_layers = 6
+dec_layers = 6
+pre_norm = False
+dim_feedforward = 2048
+hidden_dim = 256
+dropout = 0.0
+nheads = 8
+num_queries = 900
+query_dim = 4
+num_patterns = 0
+random_refpoints_xy = False
+fix_refpoints_hw = -1
+dec_layer_number = None
+num_feature_levels = 4
+enc_n_points = 4
+dec_n_points = 4
+dln_xy_noise = 0.2
+dln_hw_noise = 0.2
+two_stage_type = 'standard'
+two_stage_bbox_embed_share = False
+two_stage_class_embed_share = False
+two_stage_learn_wh = False
+two_stage_default_hw = 0.05
+two_stage_keep_all_tokens = False
+rm_detach = None
+num_select = 50
+transformer_activation = 'relu'
+batch_norm_type = 'FrozenBatchNorm2d'
+
+masks = False
+losses = ["smpl_pose", "smpl_beta", "smpl_expr",
+ "smpl_kp2d","smpl_kp3d","smpl_kp3d_ra",'labels', 'boxes', "keypoints"]
+# losses = ['labels', 'boxes', "keypoints"]
+aux_loss = True
+set_cost_class = 2.0
+set_cost_bbox = 5.0
+set_cost_giou = 2.0
+set_cost_keypoints = 10.0
+set_cost_kpvis = 0.0
+set_cost_oks = 4.0
+cls_loss_coef = 2.0
+# keypoints_loss_coef = 10.0
+
+smpl_pose_loss_root_coef = 10 * 0.1
+smpl_pose_loss_body_coef = 1 * 0.1
+smpl_pose_loss_lhand_coef = 1 * 0.1
+smpl_pose_loss_rhand_coef = 1 * 0.1
+smpl_pose_loss_jaw_coef = 1 * 0.1
+smpl_beta_loss_coef = 0.01
+smpl_expr_loss_coef = 0.01
+
+# smpl_kp3d_loss_coef = 10
+smpl_body_kp3d_loss_coef = 10.0 * 0.1
+smpl_face_kp3d_loss_coef = 1.0 * 0.1
+smpl_lhand_kp3d_loss_coef = 1 * 0.1
+smpl_rhand_kp3d_loss_coef = 1 * 0.1
+
+# kp3d ra
+smpl_body_kp3d_ra_loss_coef = 10 * 0.1
+smpl_face_kp3d_ra_loss_coef = 1 * 0.1
+smpl_lhand_kp3d_ra_loss_coef = 1 * 0.1
+smpl_rhand_kp3d_ra_loss_coef = 1 * 0.1
+
+
+# smpl_kp2d_ba_loss_coef = 1.0
+smpl_body_kp2d_loss_coef = 10.0 * 0.1
+smpl_lhand_kp2d_loss_coef = 5.0 * 0.1
+smpl_rhand_kp2d_loss_coef = 5.0 * 0.1
+smpl_face_kp2d_loss_coef = 1.0 * 0.1
+
+smpl_body_kp2d_ba_loss_coef = 0 * 0.1
+smpl_face_kp2d_ba_loss_coef = 0 * 0.1
+smpl_lhand_kp2d_ba_loss_coef = 0 * 0.1
+smpl_rhand_kp2d_ba_loss_coef = 0 * 0.1
+
+bbox_loss_coef = 5.0
+body_bbox_loss_coef = 5.0
+lhand_bbox_loss_coef = 5.0
+rhand_bbox_loss_coef = 5.0
+face_bbox_loss_coef = 5.0
+
+giou_loss_coef = 2.0
+body_giou_loss_coef = 2.0
+rhand_giou_loss_coef = 2.0
+lhand_giou_loss_coef = 2.0
+face_giou_loss_coef = 2.0
+
+keypoints_loss_coef = 10.0
+rhand_keypoints_loss_coef = 10.0
+lhand_keypoints_loss_coef = 10.0
+face_keypoints_loss_coef = 10.0
+
+oks_loss_coef=4.0
+rhand_oks_loss_coef = 0.5
+lhand_oks_loss_coef = 0.5
+face_oks_loss_coef = 4.0
+
+
+enc_loss_coef = 1.0
+interm_loss_coef = 1.0
+no_interm_box_loss = False
+focal_alpha = 0.25
+rm_self_attn_layers = None
+indices_idx_list = [1, 2, 3, 4, 5, 6, 7]
+
+decoder_sa_type = 'sa'
+matcher_type = 'HungarianMatcher'
+decoder_module_seq = ['sa', 'ca', 'ffn']
+nms_iou_threshold = -1
+
+dec_pred_bbox_embed_share = False
+dec_pred_class_embed_share = False
+dec_pred_pose_embed_share = False
+body_only = True
+
+# for dn
+use_dn = True
+dn_number = 100
+dn_box_noise_scale = 0.4
+dn_label_noise_ratio = 0.5
+embed_init_tgt = False
+dn_label_coef = 0.3
+dn_bbox_coef = 0.5
+dn_batch_gt_fuse = False
+dn_attn_mask_type_list = ['match2dn', 'dn2dn', 'group2group']
+dn_labelbook_size = 100
+
+match_unstable_error = False
+
+# for ema
+use_ema = True
+ema_decay = 0.9997
+ema_epoch = 0
+
+cls_no_bias = False
+num_body_points = 17 # for coco
+num_hand_points = 6 # for coco
+num_face_points = 6 # for coco
+num_group = 100
+num_box_decoder_layers = 2
+num_hand_face_decoder_layers = 4
+no_mmpose_keypoint_evaluator = True
+strong_aug = False
+
+body_model_test=\
+ dict(
+ type='smplx',
+ keypoint_src='smplx',
+ num_expression_coeffs=10,
+ num_betas=10,
+ keypoint_dst='smplx_137',
+ model_path='data/body_models/smplx',
+ use_pca=False,
+ use_face_contour=True)
+
+body_model_train = \
+ dict(
+ type='smplx',
+ keypoint_src='smplx',
+ num_expression_coeffs=10,
+ num_betas=10,
+ keypoint_dst='smplx_137',
+ model_path='data/body_models/smplx',
+ use_pca=False,
+ use_face_contour=True)
+
+# will be updated in exp
+exp_name = 'output/exp52/dataset_debug'
+
+
+end_epoch = 150
+train_batch_size = 32
+
+scheduler = 'step'
+step_size = 20
+gamma = 0.1
+
+# continue
+continue_train = True
+pretrained_model_path = '../output/train_gta_synbody_ft_20230410_132110/model_dump/snapshot_2.pth.tar'
+
+# dataset setting
+# dataset_list = ['AGORA_MM','BEDLAM', 'COCO_NA']
+# trainset_3d = ['AGORA_MM','BEDLAM', 'COCO_NA']
+dataset_list = ['INFERENCE_demo']
+trainset_3d = []
+trainset_2d = []
+trainset_partition = {
+ }
+trainset_humandata = []
+testset = 'INFERENCE_demo'
+train_sizes=[480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
+train_max_size=1333
+test_sizes=[800]
+test_max_size=1333
+no_aug=False
+# model
+use_cache = True
+
+## UBody setting
+train_sample_interval = 10
+test_sample_interval = 100
+make_same_len = False
+
+## input, output size
+input_body_shape = (256, 192)
+output_hm_shape = (16, 16, 12)
+input_hand_shape = (256, 256)
+output_hand_hm_shape = (16, 16, 16)
+output_face_hm_shape = (8, 8, 8)
+input_face_shape = (192, 192)
+focal = (5000, 5000) # virtual focal lengths
+princpt = (input_body_shape[1] / 2, input_body_shape[0] / 2
+ ) # virtual principal point position
+body_3d_size = 2
+hand_3d_size = 0.3
+face_3d_size = 0.3
+camera_3d_size = 2.5
+
+bbox_ratio = 1.2
+
+## directory
+output_dir, model_dir, vis_dir, log_dir, result_dir, code_dir = None, None, None, None, None, None
+
+agora_benchmark = 'na' # 'agora_model', 'test_only'
+
+# strategy
+data_strategy = 'balance'  # 'balance' needs total_data_len to be defined
+total_data_len = 'auto'
\ No newline at end of file
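aios_smplx_demo.py is the config that app.py points torchrun at, and the values the app cares about (batch_size, backbone, num_person, threshold) are passed on top of it through --options. How main.py parses those overrides is not part of this diff, so the sketch below is only an assumption; it reuses mmcv's merge_from_dict, the same mechanism config/config.py relies on, to show the intended effect of key=value overrides on this file.

```python
# Assumed sketch: fold app.py's --options overrides into the demo config.
# main.py's actual option parsing is not shown in this diff.
from mmcv import Config

cfg = Config.fromfile("config/aios_smplx_demo.py")
overrides = {"batch_size": 8, "backbone": "resnet50", "num_person": 1, "threshold": 0.5}
cfg.merge_from_dict(overrides)  # new keys (num_person, threshold) are simply added
print(cfg.batch_size, cfg.threshold)  # 8 0.5
```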
diff --git a/config/aios_smplx_inference.py b/config/aios_smplx_inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..13cf56fd6589c6b79671b71b2b7217ee0db593fb
--- /dev/null
+++ b/config/aios_smplx_inference.py
@@ -0,0 +1,265 @@
+
+num_classes = 2
+lr = 0.0001*1.414/10
+param_dict_type = 'default'
+lr_backbone = 1e-05*1.414/10
+lr_backbone_names = ['backbone.0']
+lr_linear_proj_names = ['reference_points', 'sampling_offsets']
+lr_linear_proj_mult = 0.1
+ddetr_lr_param = False
+batch_size = 2
+weight_decay = 0.0001
+epochs = 200
+lr_drop = 11
+save_checkpoint_interval = 1
+clip_max_norm = 0.1
+onecyclelr = False
+multi_step_lr = True
+lr_drop_list = [30, 60]
+
+modelname = 'aios_smplx'
+frozen_weights = None
+backbone = 'resnet50'
+use_checkpoint = False
+
+dilation = False
+position_embedding = 'sine'
+pe_temperatureH = 20
+pe_temperatureW = 20
+return_interm_indices = [1, 2, 3]
+backbone_freeze_keywords = None
+enc_layers = 6
+dec_layers = 6
+pre_norm = False
+dim_feedforward = 2048
+hidden_dim = 256
+dropout = 0.0
+nheads = 8
+num_queries = 900
+query_dim = 4
+num_patterns = 0
+random_refpoints_xy = False
+fix_refpoints_hw = -1
+dec_layer_number = None
+num_feature_levels = 4
+enc_n_points = 4
+dec_n_points = 4
+dln_xy_noise = 0.2
+dln_hw_noise = 0.2
+two_stage_type = 'standard'
+two_stage_bbox_embed_share = False
+two_stage_class_embed_share = False
+two_stage_learn_wh = False
+two_stage_default_hw = 0.05
+two_stage_keep_all_tokens = False
+rm_detach = None
+num_select = 50
+transformer_activation = 'relu'
+batch_norm_type = 'FrozenBatchNorm2d'
+
+masks = False
+losses = ["smpl_pose", "smpl_beta", "smpl_expr",
+ "smpl_kp2d","smpl_kp3d","smpl_kp3d_ra",'labels', 'boxes', "keypoints"]
+# losses = ['labels', 'boxes', "keypoints"]
+aux_loss = True
+set_cost_class = 2.0
+set_cost_bbox = 5.0
+set_cost_giou = 2.0
+set_cost_keypoints = 10.0
+set_cost_kpvis = 0.0
+set_cost_oks = 4.0
+cls_loss_coef = 2.0
+# keypoints_loss_coef = 10.0
+
+smpl_pose_loss_root_coef = 10 * 0.1
+smpl_pose_loss_body_coef = 1 * 0.1
+smpl_pose_loss_lhand_coef = 1 * 0.1
+smpl_pose_loss_rhand_coef = 1 * 0.1
+smpl_pose_loss_jaw_coef = 1 * 0.1
+smpl_beta_loss_coef = 0.01
+smpl_expr_loss_coef = 0.01
+
+# smpl_kp3d_loss_coef = 10
+smpl_body_kp3d_loss_coef = 10.0 * 0.1
+smpl_face_kp3d_loss_coef = 1.0 * 0.1
+smpl_lhand_kp3d_loss_coef = 1 * 0.1
+smpl_rhand_kp3d_loss_coef = 1 * 0.1
+
+# kp3d ra
+smpl_body_kp3d_ra_loss_coef = 10 * 0.1
+smpl_face_kp3d_ra_loss_coef = 1 * 0.1
+smpl_lhand_kp3d_ra_loss_coef = 1 * 0.1
+smpl_rhand_kp3d_ra_loss_coef = 1 * 0.1
+
+
+# smpl_kp2d_ba_loss_coef = 1.0
+smpl_body_kp2d_loss_coef = 10.0 * 0.1
+smpl_lhand_kp2d_loss_coef = 5.0 * 0.1
+smpl_rhand_kp2d_loss_coef = 5.0 * 0.1
+smpl_face_kp2d_loss_coef = 1.0 * 0.1
+
+smpl_body_kp2d_ba_loss_coef = 0 * 0.1
+smpl_face_kp2d_ba_loss_coef = 0 * 0.1
+smpl_lhand_kp2d_ba_loss_coef = 0 * 0.1
+smpl_rhand_kp2d_ba_loss_coef = 0 * 0.1
+
+bbox_loss_coef = 5.0
+body_bbox_loss_coef = 5.0
+lhand_bbox_loss_coef = 5.0
+rhand_bbox_loss_coef = 5.0
+face_bbox_loss_coef = 5.0
+
+giou_loss_coef = 2.0
+body_giou_loss_coef = 2.0
+rhand_giou_loss_coef = 2.0
+lhand_giou_loss_coef = 2.0
+face_giou_loss_coef = 2.0
+
+keypoints_loss_coef = 10.0
+rhand_keypoints_loss_coef = 10.0
+lhand_keypoints_loss_coef = 10.0
+face_keypoints_loss_coef = 10.0
+
+oks_loss_coef=4.0
+rhand_oks_loss_coef = 0.5
+lhand_oks_loss_coef = 0.5
+face_oks_loss_coef = 4.0
+
+
+enc_loss_coef = 1.0
+interm_loss_coef = 1.0
+no_interm_box_loss = False
+focal_alpha = 0.25
+rm_self_attn_layers = None
+indices_idx_list = [1, 2, 3, 4, 5, 6, 7]
+
+decoder_sa_type = 'sa'
+matcher_type = 'HungarianMatcher'
+decoder_module_seq = ['sa', 'ca', 'ffn']
+nms_iou_threshold = -1
+
+dec_pred_bbox_embed_share = False
+dec_pred_class_embed_share = False
+dec_pred_pose_embed_share = False
+body_only = True
+
+# for dn
+use_dn = True
+dn_number = 100
+dn_box_noise_scale = 0.4
+dn_label_noise_ratio = 0.5
+embed_init_tgt = False
+dn_label_coef = 0.3
+dn_bbox_coef = 0.5
+dn_batch_gt_fuse = False
+dn_attn_mask_type_list = ['match2dn', 'dn2dn', 'group2group']
+dn_labelbook_size = 100
+
+match_unstable_error = False
+
+# for ema
+use_ema = True
+ema_decay = 0.9997
+ema_epoch = 0
+
+cls_no_bias = False
+num_body_points = 17 # for coco
+num_hand_points = 6 # for coco
+num_face_points = 6 # for coco
+num_group = 100
+num_box_decoder_layers = 2
+num_hand_face_decoder_layers = 4
+no_mmpose_keypoint_evaluator = True
+strong_aug = False
+
+body_model_test=\
+ dict(
+ type='smplx',
+ keypoint_src='smplx',
+ num_expression_coeffs=10,
+ num_betas=10,
+ keypoint_dst='smplx_137',
+ model_path='data/body_models/smplx',
+ use_pca=False,
+ use_face_contour=True)
+
+body_model_train = \
+ dict(
+ type='smplx',
+ keypoint_src='smplx',
+ num_expression_coeffs=10,
+ num_betas=10,
+ keypoint_dst='smplx_137',
+ model_path='data/body_models/smplx',
+ use_pca=False,
+ use_face_contour=True)
+
+# will be updated in exp
+exp_name = 'output/exp52/dataset_debug'
+
+
+end_epoch = 150
+train_batch_size = 32
+
+scheduler = 'step'
+step_size = 20
+gamma = 0.1
+
+# continue
+continue_train = True
+pretrained_model_path = '../output/train_gta_synbody_ft_20230410_132110/model_dump/snapshot_2.pth.tar'
+
+# dataset setting
+# dataset_list = ['AGORA_MM','BEDLAM', 'COCO_NA']
+# trainset_3d = ['AGORA_MM','BEDLAM', 'COCO_NA']
+dataset_list = ['AGORA_MM','BEDLAM', 'COCO_NA']
+trainset_3d = ['AGORA_MM','BEDLAM', 'COCO_NA']
+trainset_2d = []
+trainset_partition = {
+ 'AGORA_MM': 0.4,
+ 'BEDLAM': 0.7,
+ 'COCO_NA': 1,
+
+ # 'EgoBody_Egocentric': 1,
+ # 'EgoBody_Kinect': 1.0,
+ }
+trainset_humandata = []
+testset = 'INFERENCE'
+train_sizes=[480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
+train_max_size=1333
+test_sizes=[800]
+test_max_size=1333
+no_aug=False
+# model
+use_cache = True
+
+## UBody setting
+train_sample_interval = 10
+test_sample_interval = 100
+make_same_len = False
+
+## input, output size
+input_body_shape = (256, 192)
+output_hm_shape = (16, 16, 12)
+input_hand_shape = (256, 256)
+output_hand_hm_shape = (16, 16, 16)
+output_face_hm_shape = (8, 8, 8)
+input_face_shape = (192, 192)
+focal = (5000, 5000) # virtual focal lengths
+princpt = (input_body_shape[1] / 2, input_body_shape[0] / 2
+ ) # virtual principal point position
+body_3d_size = 2
+hand_3d_size = 0.3
+face_3d_size = 0.3
+camera_3d_size = 2.5
+
+bbox_ratio = 1.2
+
+## directory
+output_dir, model_dir, vis_dir, log_dir, result_dir, code_dir = None, None, None, None, None, None
+
+agora_benchmark = 'na' # 'agora_model', 'test_only'
+
+# strategy
+data_strategy = 'balance'  # 'balance' needs total_data_len to be defined
+total_data_len = 'auto'
\ No newline at end of file
diff --git a/config/aios_smplx_pretrain.py b/config/aios_smplx_pretrain.py
new file mode 100644
index 0000000000000000000000000000000000000000..f4c6e51bffd61e020b94ce25c61193a1b6c22369
--- /dev/null
+++ b/config/aios_smplx_pretrain.py
@@ -0,0 +1,264 @@
+num_classes = 2
+lr = 0.0001
+param_dict_type = 'default'
+lr_backbone = 1e-05
+lr_backbone_names = ['backbone.0']
+lr_linear_proj_names = ['reference_points', 'sampling_offsets']
+lr_linear_proj_mult = 0.1
+ddetr_lr_param = False
+batch_size = 2
+weight_decay = 0.0001
+epochs = 200
+lr_drop = 11
+save_checkpoint_interval = 1
+clip_max_norm = 0.1
+onecyclelr = False
+multi_step_lr = True
+lr_drop_list = [30, 60]
+
+modelname = 'aios_smplx'
+frozen_weights = None
+backbone = 'resnet50'
+use_checkpoint = False
+
+dilation = False
+position_embedding = 'sine'
+pe_temperatureH = 20
+pe_temperatureW = 20
+return_interm_indices = [1, 2, 3]
+backbone_freeze_keywords = None
+enc_layers = 6
+dec_layers = 6
+pre_norm = False
+dim_feedforward = 2048
+hidden_dim = 256
+dropout = 0.0
+nheads = 8
+num_queries = 900
+query_dim = 4
+num_patterns = 0
+random_refpoints_xy = False
+fix_refpoints_hw = -1
+dec_layer_number = None
+num_feature_levels = 4
+enc_n_points = 4
+dec_n_points = 4
+dln_xy_noise = 0.2
+dln_hw_noise = 0.2
+two_stage_type = 'standard'
+two_stage_bbox_embed_share = False
+two_stage_class_embed_share = False
+two_stage_learn_wh = False
+two_stage_default_hw = 0.05
+two_stage_keep_all_tokens = False
+rm_detach = None
+num_select = 50
+transformer_activation = 'relu'
+batch_norm_type = 'FrozenBatchNorm2d'
+
+masks = False
+losses = ["smpl_pose", "smpl_beta", "smpl_expr",
+ "smpl_kp2d","smpl_kp3d","smpl_kp3d_ra",'labels', 'boxes', "keypoints"]
+# losses = ['labels', 'boxes', "keypoints"]
+aux_loss = True
+set_cost_class = 2.0
+set_cost_bbox = 5.0
+set_cost_giou = 2.0
+set_cost_keypoints = 10.0
+set_cost_kpvis = 0.0
+set_cost_oks = 4.0
+cls_loss_coef = 2.0
+# keypoints_loss_coef = 10.0
+
+smpl_pose_loss_root_coef = 10 * 0.1
+smpl_pose_loss_body_coef = 1 * 0.1
+smpl_pose_loss_lhand_coef = 1 * 0.1
+smpl_pose_loss_rhand_coef = 1 * 0.1
+smpl_pose_loss_jaw_coef = 1 * 0.1
+smpl_beta_loss_coef = 0.01
+smpl_expr_loss_coef = 0.01
+
+# smpl_kp3d_loss_coef = 10
+smpl_body_kp3d_loss_coef = 10.0 * 0.1
+smpl_face_kp3d_loss_coef = 1.0 * 0.1
+smpl_lhand_kp3d_loss_coef = 1 * 0.1
+smpl_rhand_kp3d_loss_coef = 1 * 0.1
+
+# kp3d ra
+smpl_body_kp3d_ra_loss_coef = 10 * 0.1
+smpl_face_kp3d_ra_loss_coef = 1 * 0.1
+smpl_lhand_kp3d_ra_loss_coef = 1 * 0.1
+smpl_rhand_kp3d_ra_loss_coef = 1 * 0.1
+
+
+# smpl_kp2d_ba_loss_coef = 1.0
+smpl_body_kp2d_loss_coef = 10.0 * 0.1
+smpl_lhand_kp2d_loss_coef = 5.0 * 0.1
+smpl_rhand_kp2d_loss_coef = 5.0 * 0.1
+smpl_face_kp2d_loss_coef = 1.0 * 0.1
+
+smpl_body_kp2d_ba_loss_coef = 0 * 0.1
+smpl_face_kp2d_ba_loss_coef = 0 * 0.1
+smpl_lhand_kp2d_ba_loss_coef = 0 * 0.1
+smpl_rhand_kp2d_ba_loss_coef = 0 * 0.1
+
+bbox_loss_coef = 5.0
+body_bbox_loss_coef = 5.0
+lhand_bbox_loss_coef = 5.0
+rhand_bbox_loss_coef = 5.0
+face_bbox_loss_coef = 5.0
+
+giou_loss_coef = 2.0
+body_giou_loss_coef = 2.0
+rhand_giou_loss_coef = 2.0
+lhand_giou_loss_coef = 2.0
+face_giou_loss_coef = 2.0
+
+keypoints_loss_coef = 10.0
+rhand_keypoints_loss_coef = 10.0
+lhand_keypoints_loss_coef = 10.0
+face_keypoints_loss_coef = 10.0
+
+oks_loss_coef=4.0
+rhand_oks_loss_coef = 0.5
+lhand_oks_loss_coef = 0.5
+face_oks_loss_coef = 4.0
+
+
+enc_loss_coef = 1.0
+interm_loss_coef = 1.0
+no_interm_box_loss = False
+focal_alpha = 0.25
+rm_self_attn_layers = None
+indices_idx_list = [1, 2, 3, 4, 5, 6, 7]
+
+decoder_sa_type = 'sa'
+matcher_type = 'HungarianMatcher'
+decoder_module_seq = ['sa', 'ca', 'ffn']
+nms_iou_threshold = -1
+
+dec_pred_bbox_embed_share = False
+dec_pred_class_embed_share = False
+dec_pred_pose_embed_share = False
+body_only = True
+
+# for dn
+use_dn = True
+dn_number = 100
+dn_box_noise_scale = 0.4
+dn_label_noise_ratio = 0.5
+embed_init_tgt = False
+dn_label_coef = 0.3
+dn_bbox_coef = 0.5
+dn_batch_gt_fuse = False
+dn_attn_mask_type_list = ['match2dn', 'dn2dn', 'group2group']
+dn_labelbook_size = 100
+
+match_unstable_error = False
+
+# for ema
+use_ema = True
+ema_decay = 0.9997
+ema_epoch = 0
+
+cls_no_bias = False
+num_body_points = 17 # for coco
+num_hand_points = 6 # for coco
+num_face_points = 6 # for coco
+num_group = 100
+num_box_decoder_layers = 2
+num_hand_face_decoder_layers = 4
+no_mmpose_keypoint_evaluator = True
+strong_aug = False
+
+body_model_test=\
+ dict(
+ type='smplx',
+ keypoint_src='smplx',
+ num_expression_coeffs=10,
+ num_betas=10,
+ keypoint_dst='smplx_137',
+ model_path='data/body_models/smplx',
+ use_pca=False,
+ use_face_contour=True)
+
+body_model_train = \
+ dict(
+ type='smplx',
+ keypoint_src='smplx',
+ num_expression_coeffs=10,
+ num_betas=10,
+ keypoint_dst='smplx_137',
+ model_path='data/body_models/smplx',
+ use_pca=False,
+ use_face_contour=True)
+
+# will be updated in exp
+exp_name = 'output/exp52/dataset_debug'
+
+
+end_epoch = 150
+train_batch_size = 32
+
+scheduler = 'step'
+step_size = 20
+gamma = 0.1
+
+# continue
+continue_train = True
+pretrained_model_path = '../output/train_gta_synbody_ft_20230410_132110/model_dump/snapshot_2.pth.tar'
+
+# dataset setting
+# dataset_list = ['AGORA_MM','BEDLAM', 'COCO_NA']
+# trainset_3d = ['AGORA_MM','BEDLAM', 'COCO_NA']
+dataset_list = ['AGORA_MM','BEDLAM', 'COCO_NA']
+trainset_3d = ['AGORA_MM','BEDLAM', 'COCO_NA']
+trainset_2d = []
+trainset_partition = {
+ 'AGORA_MM': 0.4,
+ 'BEDLAM': 0.7,
+ 'COCO_NA': 1,
+
+ # 'EgoBody_Egocentric': 1,
+ # 'EgoBody_Kinect': 1.0,
+ }
+trainset_humandata = []
+testset = 'AGORA_MM'
+train_sizes=[480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800]
+train_max_size=1333
+test_sizes=[800]
+test_max_size=1333
+no_aug=False
+# model
+use_cache = True
+
+## UBody setting
+train_sample_interval = 10
+test_sample_interval = 100
+make_same_len = False
+
+## input, output size
+input_body_shape = (256, 192)
+output_hm_shape = (16, 16, 12)
+input_hand_shape = (256, 256)
+output_hand_hm_shape = (16, 16, 16)
+output_face_hm_shape = (8, 8, 8)
+input_face_shape = (192, 192)
+focal = (5000, 5000) # virtual focal lengths
+princpt = (input_body_shape[1] / 2, input_body_shape[0] / 2
+ ) # virtual principal point position
+body_3d_size = 2
+hand_3d_size = 0.3
+face_3d_size = 0.3
+camera_3d_size = 2.5
+
+bbox_ratio = 1.2
+
+## directory
+output_dir, model_dir, vis_dir, log_dir, result_dir, code_dir = None, None, None, None, None, None
+
+agora_benchmark = 'na' # 'agora_model', 'test_only'
+
+# strategy
+data_strategy = 'balance'  # 'balance' needs total_data_len to be defined
+total_data_len = 'auto'
\ No newline at end of file
diff --git a/config/config.py b/config/config.py
new file mode 100644
index 0000000000000000000000000000000000000000..3fce085a9c5ca502df03c408a63971757001950f
--- /dev/null
+++ b/config/config.py
@@ -0,0 +1,91 @@
+import os
+import os.path as osp
+import sys
+import datetime
+from mmcv import Config as MMConfig
+
+class Config(MMConfig):
+    def __init__(self, cfg_dict=None, cfg_text=None, filename=None):
+        super().__init__(cfg_dict, cfg_text, filename)
+
+    def get_config_fromfile(self, config_path):
+        self.config_path = config_path
+
+        cfg, _ = MMConfig._file2dict(self.config_path)
+
+        self.merge_from_dict(cfg)
+        # #import ipdb;ipdb.set_trace()
+        # self.__dict__.update(dict(cfg))
+        # # update dir
+        dir_dict = {}
+        exp_name = 'exps62'
+        time_str = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+        dir_dict['cur_dir'] = osp.dirname(os.path.abspath(__file__))
+        dir_dict['root_dir'] = osp.join(dir_dict['cur_dir'], '..')
+        dir_dict['output_dir'] = osp.join(dir_dict['root_dir'], exp_name)
+        dir_dict['result_dir'] = osp.join(dir_dict['output_dir'], 'result')
+        dir_dict['data_dir'] = osp.join(dir_dict['root_dir'], 'dataset')
+        dir_dict['human_model_path'] = osp.join('data/body_models')
+        self.merge_from_dict(dir_dict)
+
+        ## add some paths to the system root dir
+        sys.path.insert(0, osp.join(self.root_dir, 'common'))
+        sys.path.insert(0, osp.join(self.root_dir, 'united-perception_utils'))
+        sys.path.insert(0, osp.join(self.cur_dir, 'humanbench_utils'))
+        sys.path.insert(0, osp.join(self.cur_dir, 'dinov2_utils'))
+        sys.path.insert(0, osp.join(self.cur_dir, 'lora_utils'))
+        sys.path.insert(0, osp.join(self.cur_dir, 'vit_adapter_utils'))
+        from util.dir import add_pypath
+        # add_pypath(osp.join(self.data_dir))
+        for dataset in os.listdir('datasets'):
+            if dataset not in ['humandata.py', '__pycache__', 'dataset.py']:
+                add_pypath(osp.join(self.root_dir, 'data', dataset))
+        add_pypath('datasets')
+        add_pypath(self.data_dir)
+
+    def prepare_dirs(self, exp_name):
+        time_str = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
+        self.output_dir = osp.join(self.root_dir, f'{exp_name}_{time_str}')
+        self.model_dir = osp.join(self.output_dir, 'model_dump')
+        self.vis_dir = osp.join(self.output_dir, 'vis')
+        self.log_dir = osp.join(self.output_dir, 'log')
+        self.code_dir = osp.join(self.output_dir, 'code')
+        self.result_dir = osp.dirname(self.output_dir)  # parent directory of output_dir
+        from util.dir import make_folder
+        make_folder(self.model_dir)
+        make_folder(self.vis_dir)
+        make_folder(self.log_dir)
+        make_folder(self.code_dir)
+        make_folder(self.result_dir)
+
+        ## copy some code to log dir as a backup
+        copy_files = [
+            'main/train.py', 'main/test.py', 'common/base.py', 'main/OSX.py',
+            'common/nets', 'main/OSX_WoDecoder.py', 'data/dataset.py',
+            'data/MSCOCO/MSCOCO.py', 'data/AGORA/AGORA.py'
+        ]
+        for file in copy_files:
+            os.system(f'cp -r {self.root_dir}/{file} {self.code_dir}')
+
+    def update_test_config(self, testset, agora_benchmark, shapy_eval_split,
+                           pretrained_model_path, use_cache):
+        self.testset = testset
+        self.agora_benchmark = agora_benchmark
+        self.pretrained_model_path = pretrained_model_path
+        self.shapy_eval_split = shapy_eval_split
+        self.use_cache = use_cache
+
+    def update_config(self, num_gpus, exp_name):
+        self.num_gpus = num_gpus
+        self.exp_name = exp_name
+
+        self.prepare_dirs(self.exp_name)
+
+        # Save
+        cfg_save = MMConfig(self.__dict__)
+        cfg_save.dump(osp.join(self.code_dir, 'config_base.py'))
+
+
+cfg = Config()
+cfg.get_config_fromfile('config/aios_smplx.py')
+
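config/config.py ends by building a module-level singleton and loading config/aios_smplx.py into it, so downstream code can simply import cfg and read fields; for example the virtual principal point follows from input_body_shape as (192 / 2, 256 / 2) = (96.0, 128.0). A usage sketch, assuming it runs from the repository root of the full project (the util helpers and datasets/ directory referenced in get_config_fromfile are not part of this diff):

```python
# Usage sketch for the module-level singleton defined above. Assumes the full
# repository layout (util.dir, datasets/) so that importing cfg succeeds.
from config.config import cfg

print(cfg.testset)             # 'INFERENCE_demo' (from config/aios_smplx.py)
print(cfg.focal, cfg.princpt)  # (5000, 5000) (96.0, 128.0)

# cfg.update_config(num_gpus=1, exp_name='output/exp_demo')  # stamps a timestamped output tree
```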
diff --git a/data/body_models/J_regressor_extra.npy b/data/body_models/J_regressor_extra.npy
new file mode 100644
index 0000000000000000000000000000000000000000..d6cf8c0f6747d3c623a0d300c5176843ae99031d
--- /dev/null
+++ b/data/body_models/J_regressor_extra.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cc968ea4f9855571e82f90203280836b01f13ee42a8e1b89d8d580b801242a89
+size 496160
diff --git a/data/body_models/J_regressor_h36m.npy b/data/body_models/J_regressor_h36m.npy
new file mode 100644
index 0000000000000000000000000000000000000000..d8ea80f7f2fa4c3fde21c543d28376b84e22d77a
--- /dev/null
+++ b/data/body_models/J_regressor_h36m.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c655cd7013d7829eb9acbebf0e43f952a3fa0305a53c35880e39192bfb6444a0
+size 937168
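The two .npy files above are LFS pointers for auxiliary joint regressors. Their byte sizes (496160 and 937168) are consistent with float64 arrays of shape (9, 6890) and (17, 6890) once the .npy header is accounted for, i.e. regressors over the 6890 SMPL vertices; that shape reading is inferred from the sizes, not stated in the diff. Once the LFS objects are pulled they load directly with NumPy:

```python
# Sketch: load the joint regressors after `git lfs pull`. Shapes are inferred
# from the pointer sizes above, not stated anywhere in this diff.
import numpy as np

J_regressor_extra = np.load("data/body_models/J_regressor_extra.npy")  # expected (9, 6890)
J_regressor_h36m = np.load("data/body_models/J_regressor_h36m.npy")    # expected (17, 6890)

# A regressor maps mesh vertices to joints:
# joints = J_regressor_h36m @ vertices  # (17, 3) from a (6890, 3) vertex array
```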
diff --git a/data/body_models/J_regressor_mano_LEFT.txt b/data/body_models/J_regressor_mano_LEFT.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a392696c2a8ddd2af11ad9821f2d60352c4f4590
--- /dev/null
+++ b/data/body_models/J_regressor_mano_LEFT.txt
@@ -0,0 +1,1902 @@
+# 21 778
+0 4 0.0019103600293901542
+0 5 0.0027920646583394562
+0 6 0.00029390154298310065
+0 7 0.00014695077149155033
+0 25 0.0016164584864070536
+0 26 0.000440852314474651
+0 32 0.011756061719324026
+0 33 0.021234386480529024
+0 34 0.019838354151359296
+0 35 0.016311535635562088
+0 36 0.015870683321087434
+0 37 0.02343864805290228
+0 38 0.01671565025716385
+0 39 0.020499632623071272
+0 40 0.005437178545187362
+0 41 0.010139603232916973
+0 42 0.002645113886847906
+0 43 0.00014695077149155033
+0 44 0.02005878030859662
+0 45 0.02233651726671565
+0 50 0.01763409257898604
+0 51 0.01704628949301984
+0 52 0.019838354151359296
+0 53 0.02079353416605437
+0 54 0.00822924320352682
+0 55 0.00822924320352682
+0 78 0.011572373254959589
+0 79 0.011939750183688464
+0 84 0.01704628949301984
+0 85 0.019691403379867745
+0 88 0.005437178545187362
+0 89 0.0007347538574577516
+0 90 0.014548126377663484
+0 91 0.018736223365172666
+0 92 0.011645848640705364
+0 106 0.018515797207935343
+0 107 0.02204261572373255
+0 108 0.012417340191036004
+0 109 0.009992652461425423
+0 110 0.016311535635562088
+0 111 0.01880969875091844
+0 112 0.0073475385745775165
+0 113 0.0014695077149155032
+0 114 0.005731080088170463
+0 116 0.02204261572373255
+0 117 0.012123438648052902
+0 118 0.013005143277002204
+0 119 0.016385011021307863
+0 120 0.008155767817781044
+0 121 0.011315209404849376
+0 122 0.009037472446730345
+0 130 0.0073475385745775165
+0 131 0.00911094783247612
+0 178 0.001763409257898604
+0 179 0.002351212343864805
+0 190 0.019544452608376194
+0 191 0.019691403379867745
+0 192 0.01704628949301984
+0 193 0.016605437178545186
+0 200 0.002351212343864805
+0 203 0.00822924320352682
+0 204 0.007641440117560617
+0 205 0.01704628949301984
+0 207 0.001763409257898604
+0 208 0.005290227773695812
+0 209 0.01763409257898604
+0 210 0.019691403379867745
+0 211 0.019691403379867745
+0 214 0.011315209404849376
+0 215 0.011315209404849376
+0 216 0.007641440117560617
+0 217 0.00822924320352682
+0 218 0.002351212343864805
+0 219 0.0011756061719324026
+0 227 0.002351212343864805
+0 229 0.007788390889052168
+0 231 0.002204261572373255
+0 232 0.016311535635562088
+0 233 0.006759735488611315
+0 234 0.011168258633357825
+0 235 0.019544452608376194
+0 236 0.0016164584864070536
+0 239 0.011315209404849376
+0 241 0.0007347538574577516
+0 242 0.002351212343864805
+0 243 0.0036737692872887582
+0 244 0.0011756061719324026
+0 254 0.0064658339456282144
+0 255 0.0038207200587803084
+0 256 0.002351212343864805
+0 257 0.002351212343864805
+0 264 0.014107274063188832
+0 265 0.00440852314474651
+0 279 0.011315209404849376
+0 284 0.00896399706098457
+0 285 0.0029390154298310064
+1 0 0.014595751184471957
+1 1 0.025294207550053488
+1 2 0.019180803912578332
+1 3 0.01039278618370778
+1 4 0.03156044627846554
+1 5 0.025752712822864135
+1 6 0.014977838911814154
+1 7 0.023307351367874065
+1 8 0.005654898364664528
+1 9 0.009170105456212748
+1 10 0.002063273727647868
+1 11 0.0006113403637475165
+1 12 0.0018340210912425497
+1 14 0.001222680727495033
+1 15 7.641754546843957e-05
+1 16 0.0011462631820265935
+1 17 0.0004585052728106374
+1 18 0.00015283509093687913
+1 19 0.0003820877273421978
+1 22 7.641754546843957e-05
+1 24 0.01413724591166132
+1 25 0.019257221458046772
+1 26 0.024377197004432218
+1 27 0.017346782821335782
+1 28 0.0007641754546843956
+1 29 0.0022161088185847473
+1 30 0.0006877579092159561
+1 31 0.0005349228182790769
+1 32 0.0005349228182790768
+1 33 0.0005349228182790769
+1 34 0.0024071526822558465
+1 35 0.002445361454990066
+1 36 0.029802842732691428
+1 37 0.022122879413113253
+1 38 0.010029802842732692
+1 39 0.02334556014060829
+1 40 0.029344337459880795
+1 41 0.032171786642213054
+1 42 0.02009781445819961
+1 43 0.009934280910897143
+1 60 0.004355800091701055
+1 61 0.00855876509246523
+1 62 0.0004585052728106374
+1 63 0.003285954455142901
+1 64 0.0012990982729634726
+1 65 7.641754546843957e-05
+1 66 0.0019868561821794286
+1 67 0.004814305364511693
+1 68 0.008253094910591475
+1 69 0.0018340210912425497
+1 70 0.0003820877273421978
+1 71 7.641754546843957e-05
+1 88 0.021320495185694635
+1 89 0.013907993275256002
+1 90 0.01986856182179429
+1 91 0.013564114320648022
+1 92 0.003763564114320649
+1 93 0.0004585052728106374
+1 94 0.008329512456059913
+1 95 0.007565337001375517
+1 104 0.0027510316368638244
+1 105 0.0072596668195017595
+1 109 0.009705028274491823
+1 110 0.005654898364664528
+1 111 0.015436344184624792
+1 112 0.019180803912578332
+1 113 0.03339446736970809
+1 114 0.0340058077334556
+1 115 0.02559987773192725
+1 116 0.008405930001528351
+1 117 0.0017767079321412199
+1 118 0.00527281063732233
+1 119 0.00032477456824086816
+1 122 0.004967140455448571
+1 123 0.007259666819501758
+1 124 0.0016811860003056705
+1 125 0.0025217790004585057
+1 126 0.008176677365123033
+1 129 0.00030567018187375826
+1 145 0.00030567018187375826
+1 146 0.0006877579092159561
+1 147 7.641754546843957e-05
+1 152 7.641754546843957e-05
+1 157 0.002063273727647868
+1 158 0.0016047684548372307
+1 159 0.0032095369096744614
+1 188 0.0007641754546843956
+1 190 0.0019868561821794286
+1 191 0.0004585052728106374
+1 192 0.0016047684548372307
+1 193 0.005884151001069847
+1 207 0.00015283509093687913
+1 208 7.641754546843957e-05
+1 209 0.00030567018187375826
+1 216 0.0008405930001528353
+1 217 0.003897294818890417
+1 218 0.0008405930001528353
+1 219 0.0014519333639003516
+1 227 0.005502063273727648
+1 229 0.008635182637933671
+1 230 0.004126547455295736
+1 231 0.009705028274491824
+1 232 0.01245605991135565
+1 233 0.016888277548525142
+1 234 0.001413724591166132
+1 235 0.005654898364664528
+1 236 0.012838147638697846
+1 239 0.00026746140913953847
+1 240 0.01543634418462479
+1 241 0.0006877579092159561
+1 242 0.0032095369096744614
+1 248 0.004890722909980132
+1 249 0.0005349228182790769
+1 250 0.0015283509093687911
+1 251 0.0009170105456212748
+1 252 0.0029038667278007036
+1 253 0.005502063273727649
+1 254 0.0019868561821794286
+1 255 0.0002292526364053187
+1 264 0.028885832187070158
+1 265 0.029650007641754548
+1 266 0.006953996637628001
+1 267 0.002445361454990066
+1 268 0.00015283509093687913
+1 285 0.010087116001834023
+1 286 0.007794589637780836
+1 287 0.0025981965459269452
+1 697 0.0004585052728106374
+1 699 7.641754546843957e-05
+1 700 0.00030567018187375826
+1 704 0.0002292526364053187
+1 705 0.0008405930001528353
+1 706 7.641754546843957e-05
+2 0 0.0027531810402559712
+2 1 0.0034972840241089364
+2 2 0.007887491628841432
+2 3 0.0056551826772825355
+2 4 0.009152466701391472
+2 5 0.01674231713669172
+2 6 0.02708534861224793
+2 7 0.02209985862043307
+2 8 0.00833395341915321
+2 9 0.009152466701391472
+2 10 0.011682416846491553
+2 11 0.0055063620805119425
+2 12 0.005431951782126646
+2 13 0.0011161544757794478
+2 14 0.006176054765979612
+2 15 0.0017858471612471167
+2 16 0.0007441029838529652
+2 19 0.0003720514919264826
+2 26 0.000967333879008855
+2 27 0.0008929235806235583
+2 28 0.013245033112582783
+2 29 0.013765905201279856
+2 30 0.009970979983629735
+2 31 0.011384775652950369
+2 36 0.0023811295483294886
+2 37 0.00014882059677059304
+2 38 7.441029838529652e-05
+2 39 0.0020834883547883026
+2 40 0.0055063620805119425
+2 41 0.009896569685244438
+2 42 0.022843961604286034
+2 43 0.032666120991145166
+2 60 0.00364610462087953
+2 61 0.0017858471612471167
+2 62 0.0002976411935411861
+2 63 0.000967333879008855
+2 64 0.0014882059677059304
+2 65 0.0004464617903117792
+2 68 0.0002976411935411861
+2 69 7.441029838529652e-05
+2 88 0.01562616266091227
+2 89 0.027234169209018527
+2 90 0.00513431058858546
+2 91 0.0006696926854676687
+2 93 7.441029838529652e-05
+2 94 0.0005952823870823722
+2 104 0.025225091152615526
+2 105 0.017858471612471165
+2 113 0.0035716943224942334
+2 114 0.002604360443485378
+2 115 0.010566262370712107
+2 123 0.026787707418706754
+2 124 0.021504576233350697
+2 125 0.01882580549148002
+2 126 0.02083488354788303
+2 127 0.0002232308951558896
+2 128 0.0002976411935411861
+2 129 0.0017114368628618197
+2 144 0.0002232308951558896
+2 145 0.0013393853709353374
+2 158 0.002604360443485378
+2 193 0.0003720514919264826
+2 217 0.0007441029838529652
+2 219 0.0004464617903117792
+2 227 0.003199642830567751
+2 229 0.003125232532182454
+2 230 0.008854825507850286
+2 231 0.00982215938685914
+2 232 0.002009078056403006
+2 233 0.007813081330456134
+2 235 7.441029838529652e-05
+2 236 0.01912344668502121
+2 240 0.01480764937867401
+2 248 0.03318699307984225
+2 249 0.01823052310439765
+2 250 0.02887119577349505
+2 251 0.02500186025745963
+2 252 0.02864796487833916
+2 253 0.032889351886301064
+2 259 0.00014882059677059304
+2 264 0.0002232308951558896
+2 265 0.0005952823870823722
+2 266 0.015402931765756382
+2 267 0.01622144504799464
+2 286 0.02805268249125679
+2 287 0.025820373539697895
+2 697 0.014510008185132822
+2 698 0.008631594612694398
+2 699 0.011161544757794479
+2 700 0.01049185207232681
+2 701 0.00811072252399732
+2 702 0.013393853709353377
+2 703 0.010938313862638589
+2 704 0.008185132822382618
+2 705 0.02187662772527718
+2 706 0.018825805491480024
+2 707 0.011905647741647447
+2 708 0.007217798943373763
+2 709 0.005059900290200163
+2 710 0.003199642830567751
+2 711 0.0019346677580177095
+2 712 0.005952823870823722
+2 713 0.00364610462087953
+2 714 0.00364610462087953
+2 715 0.0026787707418706747
+2 716 0.0021578986531735995
+2 721 0.0006696926854676687
+2 722 0.0002232308951558896
+2 723 0.0002232308951558896
+2 725 0.0004464617903117792
+2 731 0.0032740531289530473
+2 732 0.0008185132822382618
+2 741 0.0005952823870823722
+2 742 0.0005208720886970756
+2 746 0.0002232308951558896
+2 749 0.0005208720886970756
+2 753 0.0034972840241089364
+2 754 0.004018156112806012
+2 755 0.0014882059677059304
+2 757 0.0008929235806235583
+2 758 0.0014137956693206339
+2 759 0.0003720514919264826
+2 760 7.441029838529652e-05
+3 6 0.0019164148301024542
+3 7 0.0014004569912287167
+3 8 0.000884499152354979
+3 9 0.00029483305078499295
+3 10 0.004422495761774894
+3 11 0.0011793322031399718
+3 12 0.0005896661015699859
+3 14 0.0011056239404437236
+3 28 0.011203655929829732
+3 29 0.0037591213975086604
+3 30 0.004496204024471142
+3 31 0.011645905506007222
+3 43 0.0019164148301024544
+3 89 0.0005896661015699859
+3 104 0.009729490675904768
+3 105 0.002137539618191199
+3 123 0.006412618854573597
+3 124 0.0187956069875433
+3 125 0.013414903810717178
+3 126 0.004938453600648632
+3 230 0.0007370826269624824
+3 231 0.00022112478808874474
+3 236 0.0005159578388737376
+3 240 0.0008844991523549787
+3 248 0.007665659320409817
+3 249 0.013120070759932186
+3 250 0.009434657625119773
+3 251 0.012088155082184712
+3 252 0.004348787499078646
+3 253 0.003022038770546178
+3 266 0.0029483305078499295
+3 267 0.0125304046583622
+3 286 0.002727205719761185
+3 287 0.005896661015699859
+3 697 0.01805852436058082
+3 698 0.019016731775632047
+3 699 0.021375396181911987
+3 700 0.01968010613989828
+3 701 0.023512935800103187
+3 702 0.01975381440259453
+3 703 0.021965062283481978
+3 704 0.019164148301024544
+3 705 0.015331318640819633
+3 706 0.017837399572492075
+3 707 0.02889363897692931
+3 708 0.02130168791921574
+3 709 0.027050932409523103
+3 710 0.024544851477850665
+3 711 0.0209331466057345
+3 712 0.0232181027493182
+3 713 0.023070686223925697
+3 714 0.024102601901673175
+3 715 0.018353357411365814
+3 716 0.017026608682833344
+3 717 0.0016952900420137097
+3 718 0.0062652023291811
+3 719 0.0033168718213311705
+3 720 0.00125304046583622
+3 721 0.016879192157440846
+3 722 0.01090882287904474
+3 723 0.008402741947372299
+3 724 0.004717328812559887
+3 725 0.010982531141740989
+3 726 0.0033168718213311705
+3 727 0.0008107908896587306
+3 730 7.370826269624824e-05
+3 731 0.022775853173140702
+3 732 0.018279649148669565
+3 733 0.009803198938601014
+3 734 0.003022038770546178
+3 735 0.0003685413134812412
+3 736 0.011719613768703471
+3 737 0.003906537922901157
+3 738 0.0008107908896587306
+3 739 0.013488612073413427
+3 740 0.005306994914129874
+3 741 0.021301687919215745
+3 742 0.019606397877202027
+3 743 0.0022112478808874476
+3 746 0.006338910591877348
+3 747 0.00125304046583622
+3 748 0.0016952900420137097
+3 749 0.009876907201297264
+3 750 0.003022038770546178
+3 751 7.370826269624824e-05
+3 753 0.025208225842116898
+3 754 0.0209331466057345
+3 755 0.023291811012014444
+3 756 0.017837399572492075
+3 757 0.021449104444608236
+3 758 0.01975381440259453
+3 759 0.01171961376870347
+3 760 0.01348861207341343
+3 761 0.003906537922901157
+3 762 0.005306994914129872
+3 763 0.007960492371194809
+3 764 0.0008107908896587306
+3 765 0.0003685413134812412
+3 767 0.0022112478808874476
+3 768 0.0011056239404437238
+4 745 1.0
+5 0 0.0012638674343491084
+5 1 0.0001404297149276787
+5 2 0.00035107428731919675
+5 3 0.002808594298553574
+5 8 0.004072461732902682
+5 9 0.0007723634321022329
+5 10 0.004774610307541076
+5 11 0.01418340120769555
+5 12 0.012357814913635726
+5 13 0.01930908580255582
+5 14 0.007934278893413846
+5 15 0.020011234377194213
+5 16 0.0021064457239151806
+5 17 0.0006319337171745541
+5 18 0.0022468754388428594
+5 19 0.009127931470299114
+5 21 0.00042128914478303613
+5 24 0.0009127931470299115
+5 25 7.021485746383936e-05
+5 26 0.0001404297149276787
+5 27 0.0010532228619575903
+5 28 0.0004212891447830361
+5 29 0.0015447268642044658
+5 30 0.003932032017975004
+5 31 0.0009127931470299115
+5 46 0.0006319337171745542
+5 47 0.00035107428731919675
+5 48 0.003721387445583485
+5 49 0.0027383794410897346
+5 56 0.0002808594298553574
+5 57 7.021485746383936e-05
+5 58 0.0010532228619575903
+5 59 0.0028788091560174134
+5 60 0.010040724617329027
+5 61 0.005687403454570988
+5 62 0.029981744137059403
+5 63 0.017483499508496
+5 64 0.02029209380704957
+5 65 0.024294340682488414
+5 66 0.0029490240134812527
+5 67 0.0011234377194214297
+5 68 0.005827833169498665
+5 69 0.00975986518747367
+5 74 0.00217666058137902
+5 75 0.0010532228619575903
+5 76 0.00035107428731919675
+5 77 0.00021064457239151807
+5 86 0.0007723634321022329
+5 87 0.0021064457239151806
+5 93 0.018536722370453586
+5 94 0.0016851565791321445
+5 95 0.0001404297149276787
+5 104 7.021485746383936e-05
+5 105 0.0001404297149276787
+5 127 0.023592192107850022
+5 128 0.02710293498104199
+5 129 0.020713382951832608
+5 132 0.023030473248139307
+5 133 0.005195899452324112
+5 134 0.005195899452324112
+5 135 0.01305996348827412
+5 136 0.008495997753124563
+5 137 0.014323830922623225
+5 138 0.01818564808313439
+5 139 0.011515236624069652
+5 140 0.008215138323269205
+5 143 0.010742873191967421
+5 144 0.016991995506249125
+5 145 0.010040724617329027
+5 146 0.00035107428731919675
+5 147 0.0011234377194214297
+5 149 0.013832326920376354
+5 150 0.016430276646538407
+5 151 0.010181154332256704
+5 152 0.011023732621822779
+5 155 0.00035107428731919675
+5 156 0.001966016008987502
+5 157 7.021485746383936e-05
+5 158 0.003932032017975004
+5 164 0.0034405280157281284
+5 165 0.005195899452324111
+5 166 0.0014745120067406266
+5 167 0.0014745120067406264
+5 168 0.026049712119084405
+5 169 0.02927959556242101
+5 170 0.023873051537705376
+5 171 0.016008987501755372
+5 172 0.027102934981041993
+5 173 0.016921780648785283
+5 174 0.005546973739643309
+5 175 0.005406544024715631
+5 176 0.013551467490520995
+5 177 0.00758320460609465
+5 183 7.021485746383936e-05
+5 185 0.009127931470299114
+5 186 0.017834573795815194
+5 187 0.008074708608341525
+5 189 0.007161915461311614
+5 194 0.010602443477039742
+5 195 0.01060244347703974
+5 206 0.0013340822918129478
+5 212 0.007091700603847775
+5 213 0.0013340822918129476
+5 219 0.0002808594298553574
+5 220 0.00435332116275804
+5 222 0.0002808594298553574
+5 223 0.00042128914478303613
+5 225 0.0016851565791321445
+5 226 0.00042128914478303613
+5 227 0.000983008004493751
+5 228 0.00975986518747367
+5 230 0.001825586294059823
+5 231 7.021485746383936e-05
+5 246 0.00035107428731919675
+5 258 0.020924027524224127
+5 259 0.022398539530964757
+5 260 0.015587698356972338
+5 261 0.012568459486027245
+5 262 0.009619435472545991
+5 263 0.01305996348827412
+5 266 0.0010532228619575903
+5 267 0.0005617188597107148
+5 268 0.004283106305294201
+5 269 0.0017553714365959837
+5 270 0.005266114309787951
+5 271 0.004844825165004915
+5 274 0.018045218368206713
+5 276 0.0002808594298553574
+5 277 0.00021064457239151807
+5 280 0.0001404297149276787
+5 288 0.00540654402471563
+5 290 7.021485746383936e-05
+5 358 0.0002808594298553574
+5 359 0.00035107428731919675
+5 362 0.00021064457239151807
+5 363 0.0002808594298553574
+5 365 7.021485746383936e-05
+5 366 0.0009127931470299116
+5 367 0.0013340822918129476
+5 368 0.005125684594860273
+5 369 0.0034405280157281284
+5 370 0.0013340822918129476
+5 371 0.00021064457239151807
+5 373 0.00042128914478303613
+5 375 0.00035107428731919675
+5 378 0.004493750877685719
+5 379 0.0034405280157281284
+5 380 0.004634180592613397
+5 383 0.00042128914478303613
+5 385 0.0016149417216683051
+5 386 0.001404297149276787
+5 387 0.0016851565791321445
+5 388 0.0002808594298553574
+5 399 0.0014745120067406264
+6 46 0.019904998869034157
+6 47 0.01960340797707909
+6 48 0.025559828093191583
+6 49 0.02352408957249491
+6 56 0.022166930558697125
+6 57 0.020131192038000453
+6 58 0.02194073738973083
+6 59 0.028952725627686037
+6 62 0.0005277840609213601
+6 65 0.00022619316896629722
+6 86 0.02382568046444997
+6 87 0.022543919173640955
+6 127 0.0012063635678202518
+6 128 0.0007539772298876573
+6 132 0.0006031817839101259
+6 133 0.017643067179371183
+6 134 0.02382568046444997
+6 135 0.01379778330694413
+6 136 0.01259141973912388
+6 137 0.004448465656337178
+6 138 0.003091306642539395
+6 139 0.009424715373595717
+6 140 0.012214431124180048
+6 143 0.0005277840609213601
+6 144 0.0012817612908090175
+6 150 0.0008293749528764231
+6 155 0.019678805700067855
+6 156 0.0244288622483601
+6 164 0.019980396592022914
+6 165 0.017944658071326246
+6 166 0.023222498680539848
+6 167 0.023901078187438737
+6 168 0.002789715750584332
+6 169 0.002186533966674206
+6 170 0.00987710171152831
+6 171 0.005881022393123726
+6 172 0.004071477041393349
+6 173 0.011837442509236221
+6 174 0.022166930558697128
+6 175 0.02382568046444997
+6 176 0.019377214808112796
+6 177 0.013119203800045236
+6 185 0.0016587499057528462
+6 186 0.004448465656337178
+6 187 0.0005277840609213601
+6 189 0.020809771544899342
+6 194 0.015154942320741913
+6 195 0.01839704440925884
+6 212 0.021262157882831936
+6 213 0.022317726004674656
+6 221 0.006333408731056322
+6 222 0.016210510442584633
+6 223 0.018472442132247607
+6 224 0.00987710171152831
+6 225 0.02744477116791073
+6 226 0.020583578375933047
+6 228 0.0005277840609213602
+6 237 0.012516022016135112
+6 238 0.011912840232224985
+6 245 0.011912840232224985
+6 258 0.0052024428862248355
+6 259 0.002337329412651738
+6 260 0.007162783683932745
+6 261 0.013043806077056472
+6 262 0.0016587499057528462
+6 263 0.007388976852899043
+6 272 0.014174771921887958
+6 273 0.012817612908090177
+6 274 0.0059564201161124925
+6 280 0.019301817085124028
+6 281 0.011385056171303627
+6 282 0.011460453894292393
+6 283 0.017643067179371186
+6 294 0.003920681595415819
+6 295 0.0069365905149664465
+6 296 0.0037698861494382865
+6 297 0.00512704516323607
+6 298 0.006634999623011385
+6 299 0.002789715750584332
+6 300 0.0021865339666742064
+6 301 0.0038452838724270517
+6 302 0.0005277840609213601
+6 303 0.0006031817839101259
+6 305 0.00030159089195506294
+6 316 0.0016587499057528462
+6 321 0.0009047726758651889
+6 330 0.0021111362436854408
+6 331 0.0015079544597753145
+6 340 0.00512704516323607
+6 341 0.004599261102314709
+6 342 0.0011309658448314859
+6 344 0.0007539772298876573
+6 345 0.00022619316896629722
+7 46 0.008690077640857611
+7 47 0.009188688653037966
+7 48 0.0033478167960680964
+7 49 0.0034902770852624832
+7 56 0.010898212123370611
+7 57 0.012322815015314481
+7 58 0.004202578531234419
+7 59 0.003276586651470902
+7 86 0.00648194315834461
+7 87 0.0016382933257354513
+7 133 0.00035615072298596765
+7 134 0.0015670631811382577
+7 155 0.009829759954412709
+7 156 0.004131348386637225
+7 164 0.0009259918797635161
+7 165 0.0006410713013747418
+7 166 0.003917657952845645
+7 167 0.0050573402664007405
+7 174 0.001638293325735451
+7 175 0.0014246028919438706
+7 189 0.0009259918797635161
+7 194 0.00028492057838877413
+7 195 0.0006410713013747418
+7 212 0.00042738086758316123
+7 213 0.0037039675190540643
+7 221 0.019517059619631027
+7 222 0.016739083980340477
+7 223 0.0143172590640359
+7 224 0.02443193959683738
+7 225 0.00683809388133058
+7 226 0.01111190255716219
+7 237 0.016739083980340477
+7 238 0.018092456727687157
+7 245 0.01367618776266116
+7 272 0.02236626540351877
+7 273 0.01923213904124225
+7 280 0.011040672412564997
+7 281 0.020086900776408578
+7 282 0.01859106773986751
+7 283 0.0165253935465489
+7 294 0.024004558729254222
+7 295 0.024075788873851416
+7 296 0.02443193959683738
+7 297 0.025357931476600898
+7 298 0.026283923356364414
+7 299 0.023933328584657028
+7 300 0.022722416126504736
+7 301 0.02514424104280932
+7 302 0.01738015528171522
+7 303 0.020941662511574897
+7 304 0.007835315905691288
+7 305 0.017380155281715225
+7 306 0.011396823135550965
+7 307 0.0036327373744568705
+7 308 0.0012821426027494836
+7 309 0.002777975639290548
+7 310 0.011966664292328516
+7 311 0.005342260844789515
+7 312 0.0038464278082484507
+7 313 0.0014958330365410642
+7 314 0.0007835315905691288
+7 315 0.008191466628677256
+7 316 0.022651185981907542
+7 317 0.00035615072298596765
+7 321 0.02101289265617209
+7 322 0.01225158487071729
+7 323 0.007764085761094094
+7 324 0.002564285205498967
+7 325 0.01994444048721419
+7 326 0.008690077640857611
+7 327 0.0024218249163045803
+7 328 0.0165253935465489
+7 329 0.006980554170524965
+7 330 0.028064676971294254
+7 331 0.021084122800769284
+7 332 0.0019232139041242254
+7 333 0.00021369043379158061
+7 334 0.010969442267967804
+7 335 0.0024930550609017737
+7 336 0.008690077640857611
+7 337 0.003988888097442838
+7 338 0.00028492057838877413
+7 340 0.019588289764228224
+7 341 0.0242182491630458
+7 342 0.021867654391338417
+7 343 0.014103568630244322
+7 344 0.018662297884464708
+7 345 0.014673409787021868
+7 346 0.006125792435358643
+7 347 0.009758529809815513
+7 348 0.0017095234703326447
+7 349 0.0031341263622765153
+7 350 0.004772419688011967
+7 351 0.0006410713013747418
+7 352 0.0008547617351663223
+7 353 0.00042738086758316123
+7 354 0.001068452168957903
+7 355 0.0009972220243607095
+8 317 1.0
+9 11 0.0002498906728306366
+9 13 0.0002498906728306366
+9 14 0.0009995626913225464
+9 15 0.0022490160554757294
+9 16 0.0029986880739676387
+9 17 0.002249016055475729
+9 18 0.007746610857749733
+9 19 0.00949584556756419
+9 20 0.0013743987005685012
+9 21 0.00437308677453614
+9 22 0.0009995626913225461
+9 23 0.00018741800462297744
+9 48 0.0004997813456612732
+9 59 0.0002498906728306366
+9 62 0.0014368713687761604
+9 63 0.000874617354907228
+9 64 6.247266820765915e-05
+9 65 6.247266820765915e-05
+9 66 0.0024989067283063657
+9 67 0.000437308677453614
+9 68 0.0006871993502842506
+9 69 0.0029986880739676387
+9 71 0.0004997813456612732
+9 74 0.015555694383707127
+9 75 0.017867183107390515
+9 76 0.017242456425313923
+9 77 0.00868370088086462
+9 83 6.247266820765915e-05
+9 87 0.0004997813456612732
+9 93 0.0033110514150059348
+9 127 0.0006247266820765914
+9 132 0.004810395451989753
+9 133 0.0006247266820765914
+9 135 0.0001249453364153183
+9 136 0.0004997813456612732
+9 137 0.015555694383707127
+9 138 0.007246829512088461
+9 139 0.005997376147935278
+9 140 0.008683700880864622
+9 141 0.005997376147935278
+9 142 0.0025613793965140247
+9 143 0.015743112388330104
+9 144 0.009558318235771848
+9 145 0.0032485787467982754
+9 146 0.0015618167051914785
+9 147 0.006122321484350596
+9 148 0.0025613793965140247
+9 149 0.0071843568438808006
+9 150 0.01243206097332417
+9 151 0.013993877678515648
+9 152 0.007809083525957393
+9 157 0.0001249453364153183
+9 158 0.0023114887236833884
+9 160 0.0019991253826450927
+9 161 0.0002498906728306366
+9 162 0.0005622540138689324
+9 163 0.0021240707190604106
+9 164 0.0029362154057599797
+9 165 0.002561379396514025
+9 166 0.0007496720184919098
+9 167 0.0007496720184919097
+9 168 0.002124070719060411
+9 169 0.0003123633410382957
+9 170 0.0006871993502842506
+9 171 0.002249016055475729
+9 174 0.0028737427375523207
+9 175 0.0018741800462297744
+9 176 0.009433372899356529
+9 177 0.006247266820765914
+9 181 0.00018741800462297744
+9 182 0.0009995626913225464
+9 183 0.004248141438120822
+9 185 0.019179109139751356
+9 186 0.01661772974323733
+9 187 0.019054163803336036
+9 194 0.0015618167051914785
+9 195 0.0001249453364153183
+9 196 0.0004997813456612732
+9 197 0.0014993440369838195
+9 198 0.0003748360092459549
+9 199 0.0001249453364153183
+9 202 6.247266820765915e-05
+9 206 0.013181732991816079
+9 207 6.247266820765915e-05
+9 212 0.0018741800462297742
+9 213 0.0002498906728306366
+9 218 0.0003123633410382957
+9 219 0.0006871993502842506
+9 220 0.014868495033422876
+9 225 0.0006247266820765914
+9 227 0.0006871993502842506
+9 228 0.021802961204473042
+9 230 0.0002498906728306366
+9 246 0.020803398513150495
+9 247 0.017304929093521583
+9 258 0.0004997813456612732
+9 259 0.0027487974011370024
+9 260 0.0017492347098144558
+9 261 0.002623852064721684
+9 262 0.01974136315362029
+9 263 0.01655525707502967
+9 268 0.007746610857749734
+9 269 0.02167801586805772
+9 270 0.019054163803336036
+9 271 0.011932279627662898
+9 274 0.0066221028300118695
+9 275 0.0007496720184919098
+9 276 0.016742675079652648
+9 277 0.02205285187730368
+9 288 0.022427687886549634
+9 289 0.0003123633410382957
+9 290 0.00730930218029612
+9 291 0.005685012806896982
+9 292 0.0057474854751046415
+9 293 0.008933591553695257
+9 356 0.0014993440369838195
+9 357 0.0014993440369838193
+9 358 0.00668457549821953
+9 359 0.004685450115574436
+9 360 0.0007496720184919098
+9 361 0.0007496720184919098
+9 362 0.0024989067283063657
+9 363 0.0038733054288748667
+9 364 0.0014368713687761604
+9 365 0.004498032110951459
+9 366 0.009933154245017804
+9 367 0.010245517586056099
+9 368 0.015993003061160742
+9 369 0.015993003061160742
+9 370 0.021115761854188793
+9 371 0.01693009308427563
+9 372 0.0009995626913225464
+9 373 0.0037483600924595483
+9 374 0.008996064221902918
+9 375 0.012432060973324168
+9 376 0.004498032110951458
+9 377 0.0031861060785906164
+9 378 0.017554819766352217
+9 379 0.01749234709814456
+9 380 0.01649278440682201
+9 381 0.008308864871618667
+9 382 0.006434684825388891
+9 383 0.016055475729368402
+9 384 0.012557006309739488
+9 385 0.01018304491784844
+9 386 0.015180858374461174
+9 387 0.01155744361841694
+9 388 0.009058536890110576
+9 389 0.0028112700693446614
+9 391 0.00018741800462297744
+9 392 0.0005622540138689324
+9 394 0.0018117073780221152
+9 395 0.0004997813456612732
+9 399 0.01611794839757606
+9 402 0.0008746173549072279
+9 470 0.0007496720184919098
+9 471 0.0004997813456612732
+9 478 0.0007496720184919098
+9 479 0.0004997813456612732
+9 480 0.0026863247329293434
+9 481 0.002623852064721684
+9 483 0.0001249453364153183
+9 484 0.0001249453364153183
+9 485 0.0014993440369838195
+9 486 0.0004997813456612732
+9 488 0.008996064221902916
+9 489 0.006059848816142937
+9 490 0.006497157493596552
+9 491 0.0001249453364153183
+9 492 0.0003748360092459549
+9 493 0.001311926032360842
+9 494 0.000437308677453614
+9 495 0.0017492347098144558
+9 496 0.002623852064721684
+9 497 0.0027487974011370024
+9 498 0.0006247266820765914
+9 509 0.0020615980508527517
+9 510 0.0003748360092459549
+9 579 0.0019991253826450927
+10 74 0.0005264345341054373
+10 75 0.0021809430698653833
+10 76 0.000752049334436339
+10 137 0.000827254267879973
+10 143 0.0006016394675490712
+10 150 0.0003008197337745356
+10 151 0.0006016394675490712
+10 185 0.004361886139730767
+10 186 0.0010528690682108748
+10 187 0.003910656539068963
+10 206 0.0001504098668872678
+10 220 0.0003008197337745356
+10 228 0.0030834022711889904
+10 246 0.003985861472512596
+10 247 0.0012784838685417762
+10 262 0.003910656539068963
+10 263 0.0011280740016545085
+10 269 0.0032338121380762574
+10 270 0.002857787470858088
+10 271 0.0003008197337745356
+10 276 0.000902459201323607
+10 277 0.00556516507482891
+10 288 0.0027825825374144545
+10 356 0.020305332029781156
+10 357 0.019703692562232082
+10 358 0.02549447243739189
+10 359 0.023764758968188315
+10 360 0.02587049710461006
+10 361 0.022486275099646538
+10 362 0.022411070166202904
+10 363 0.02278709483342107
+10 364 0.026321726705271865
+10 365 0.02007971722945025
+10 366 0.016093855756937656
+10 367 0.022260660299315636
+10 368 0.011882379484094157
+10 369 0.009400616680454237
+10 370 0.00962623148078514
+10 371 0.011431149883432353
+10 372 0.021583815898322933
+10 373 0.024742423102955553
+10 374 0.01947807776190118
+10 375 0.01789877415958487
+10 376 0.023388734300970146
+10 377 0.023689554034744677
+10 378 0.009400616680454237
+10 379 0.005865984808603443
+10 380 0.01135594494998872
+10 381 0.022486275099646538
+10 382 0.015341806422501316
+10 383 0.01135594494998872
+10 384 0.01158155975031962
+10 385 0.019703692562232082
+10 386 0.01504098668872678
+10 387 0.018124388959915774
+10 388 0.010077461081446944
+10 389 0.02293750470030834
+10 390 0.01383770775362864
+10 391 0.017372339625479433
+10 392 0.019703692562232086
+10 393 0.011882379484094157
+10 394 0.024667218169511923
+10 395 0.024667218169511916
+10 396 0.012333609084755958
+10 397 0.011506354816875987
+10 398 0.013236068286079568
+10 399 0.0070692637437015865
+10 400 0.01940287282845755
+10 401 0.016093855756937656
+10 402 0.020530946830112053
+10 403 0.008197337745356097
+10 404 0.01759795442581033
+10 405 0.021508610964879295
+10 406 0.008197337745356095
+10 407 0.013988117620515906
+10 408 0.008949387079792434
+10 409 0.006467624276152515
+10 410 0.005264345341054373
+10 411 0.005565165074828909
+10 412 0.003835451605625329
+10 413 0.002105738136421749
+10 414 0.0012784838685417764
+10 415 0.002556967737083553
+10 417 7.52049334436339e-05
+10 420 0.0020305332029781154
+10 421 0.0006016394675490712
+10 422 0.0006016394675490712
+10 427 7.52049334436339e-05
+10 430 0.004737910806948936
+10 431 0.002331352936752651
+10 432 0.0001504098668872678
+10 440 0.0010528690682108748
+10 441 0.0021057381364217496
+10 446 7.52049334436339e-05
+10 452 0.004512296006618034
+10 453 0.003609836805294428
+10 454 0.0006016394675490712
+10 456 0.0006016394675490712
+10 457 0.0004512296006618035
+11 356 0.011297349184080336
+11 357 0.011888060252528984
+11 358 0.004430333013364838
+11 359 0.004430333013364838
+11 360 0.009229860444510078
+11 361 0.011371188067636416
+11 362 0.0038396219449161927
+11 363 0.002805877575131064
+11 364 0.005759432917374288
+11 365 0.0014767776711216124
+11 366 0.0003691944177804031
+11 367 0.0014029387875655322
+11 372 0.011371188067636418
+11 373 0.004504171896920917
+11 374 0.0012552610204533705
+11 375 0.0011075832533412094
+11 376 0.005316399616037805
+11 377 0.005685594033818208
+11 381 0.001772133205345935
+11 382 0.0003691944177804031
+11 385 0.00118142213689729
+11 386 0.0005168721848925644
+11 387 0.0011075832533412094
+11 388 7.383888355608063e-05
+11 389 0.0031012331093553864
+11 390 0.019345787491693123
+11 391 0.010928154766299934
+11 392 0.01299564350587019
+11 393 0.02082256516281474
+11 394 0.0057594329173742895
+11 395 0.00945137709517832
+11 396 0.017352137635678947
+11 397 0.02001033744369785
+11 398 0.018238204238351912
+11 400 0.01794284870412759
+11 401 0.019124270841024884
+11 402 0.016170715498781657
+11 403 0.022816215018828915
+11 404 0.01727829875212287
+11 405 0.014546260060547885
+11 406 0.0239976371557262
+11 407 0.022963892785941076
+11 408 0.02695119249796943
+11 409 0.023776120505057962
+11 410 0.019493465258805284
+11 411 0.023849959388614037
+11 412 0.026581998080189025
+11 413 0.020601048512146496
+11 414 0.019493465258805288
+11 415 0.02163479288193162
+11 416 0.004873366314701322
+11 417 0.007900760540500627
+11 418 0.0042088163626965965
+11 419 0.0016982943217898545
+11 420 0.018238204238351912
+11 421 0.012035738019641142
+11 422 0.012331093553865465
+11 423 0.0055379162667060465
+11 424 0.004061138595584434
+11 425 0.0016982943217898542
+11 426 0.0008122277191168869
+11 427 0.00834379384183711
+11 428 0.0005168721848925643
+11 429 0.0015506165546776932
+11 430 0.023406926087277558
+11 431 0.019124270841024884
+11 432 0.016392232149449903
+11 433 0.005907110684486449
+11 434 0.0019198109724580966
+11 435 0.015432326663220851
+11 436 0.006940855054271579
+11 437 0.0013290999040094513
+11 438 0.013364837923650594
+11 439 0.00694085505427158
+11 440 0.02126559846415122
+11 441 0.02355460385438972
+11 442 0.002732038691574983
+11 444 7.383888355608063e-05
+11 446 0.010854315882743852
+11 447 0.0031012331093553864
+11 448 0.007753082773388465
+11 449 0.0018459720889020155
+11 450 0.00044303330133648377
+11 451 0.00044303330133648377
+11 452 0.023776120505057962
+11 453 0.02229934283393635
+11 454 0.02126559846415122
+11 455 0.013290999040094512
+11 456 0.018385882005464073
+11 457 0.015580004430333012
+11 458 0.010189765930739126
+11 459 0.012035738019641142
+11 460 0.0034704275271357893
+11 461 0.004578010780476998
+11 462 0.005907110684486449
+11 463 0.000590711068448645
+11 464 0.000590711068448645
+11 465 0.0002953555342243225
+11 466 0.0019936498560141768
+11 467 0.0013290999040094513
+12 445 1.0
+13 16 0.0014635288607891346
+13 17 0.002575810794988877
+13 18 0.005737033134293408
+13 19 0.001990399250673223
+13 20 0.007785973539398196
+13 21 0.008664090855871677
+13 22 0.002985598876009834
+13 23 0.002224563868399485
+13 63 5.854115443156538e-05
+13 66 0.0018147757873785268
+13 67 0.0006439526987472192
+13 68 0.0002927057721578269
+13 69 0.0008195761620419153
+13 70 0.0007024938531787846
+13 71 0.0033953869570307925
+13 72 0.0024001873316941806
+13 73 0.00023416461772626153
+13 74 0.009308043554618896
+13 75 0.007551808921671934
+13 76 0.01890879288139562
+13 77 0.013230300901533777
+13 80 0.0013464465519260039
+13 81 0.0002927057721578269
+13 82 0.0016976934785153963
+13 83 0.0040978808102095764
+13 93 0.00017562346329469617
+13 100 0.00017562346329469617
+13 102 0.00011708230886313077
+13 103 0.00035124692658939234
+13 137 0.00011708230886313077
+13 141 0.020021074815595362
+13 142 0.016625687858564567
+13 143 0.0016391523240838306
+13 144 0.0005268703898840885
+13 145 0.0002927057721578269
+13 146 0.002868516567146704
+13 147 0.006673691605198454
+13 148 0.008839714319166374
+13 149 0.0002927057721578269
+13 150 0.0002927057721578269
+13 151 0.0012293642430628731
+13 152 0.0011122819341997424
+13 157 0.0008781173164734808
+13 158 0.0004097880810209577
+13 160 0.02681184872965695
+13 161 0.023592085235920848
+13 162 0.03096827069429809
+13 163 0.02476290832455216
+13 178 0.0002927057721578269
+13 179 5.854115443156538e-05
+13 180 0.0009366584709050461
+13 181 0.00444912773679897
+13 182 0.013464465519260038
+13 183 0.0167427701674277
+13 184 0.00017562346329469617
+13 185 5.854115443156538e-05
+13 186 0.0002927057721578269
+13 187 0.0008195761620419153
+13 196 0.017503805175038047
+13 197 0.023416461772626154
+13 198 0.023416461772626154
+13 199 0.02921203606135113
+13 201 0.0018733169418100922
+13 202 0.006439526987472192
+13 206 0.015162158997775435
+13 207 0.0006439526987472192
+13 218 0.0007610350076103501
+13 219 0.00046832923545252306
+13 220 0.006673691605198454
+13 227 0.00011708230886313077
+13 228 0.0009951996253366115
+13 246 0.0106544901065449
+13 247 0.014576747453459781
+13 262 0.00011708230886313077
+13 268 0.0033368458025992264
+13 269 0.010420325488818641
+13 270 0.0035710104203254887
+13 271 0.002985598876009834
+13 275 0.009834913944502985
+13 276 0.02142606252195293
+13 277 0.01164968973188151
+13 278 0.00035124692658939234
+13 288 0.004741833508956796
+13 289 0.014693829762322912
+13 290 0.02207001522070015
+13 291 0.017913593256059006
+13 292 0.011005737033134292
+13 293 0.010478866643250203
+13 358 0.0003512469265893923
+13 363 5.854115443156538e-05
+13 365 0.00035124692658939234
+13 366 0.0007024938531787846
+13 367 0.00017562346329469617
+13 368 0.00017562346329469617
+13 369 0.0009951996253366115
+13 370 0.005151621589977754
+13 371 0.005385786207704015
+13 374 0.0016976934785153963
+13 375 0.0017562346329469615
+13 376 0.0004097880810209577
+13 377 0.0003512469265893923
+13 378 0.00046832923545252306
+13 379 0.0015220700152206996
+13 381 0.0014635288607891346
+13 382 0.0009951996253366115
+13 383 0.00532724505327245
+13 384 0.0037466338836201845
+13 386 0.0011708230886313077
+13 387 0.00011708230886313077
+13 388 0.0012293642430628731
+13 389 0.0002927057721578269
+13 394 0.00017562346329469617
+13 399 0.0033953869570307925
+13 468 5.854115443156538e-05
+13 469 0.0011122819341997424
+13 470 0.0027514342582835734
+13 471 0.0012879053974944384
+13 474 5.854115443156538e-05
+13 475 0.0002927057721578269
+13 476 0.0002927057721578269
+13 477 0.0018147757873785268
+13 478 0.0020489404051047887
+13 479 0.0011122819341997424
+13 480 0.004332045427935838
+13 481 0.006556609296335323
+13 483 0.00046832923545252306
+13 484 0.012352183585060298
+13 485 0.014869453225617611
+13 486 0.005912656597588104
+13 487 0.004214963119072708
+13 488 0.01164968973188151
+13 489 0.015806111696522657
+13 490 0.008312843929282283
+13 491 0.009834913944502985
+13 492 0.006146821215314366
+13 493 0.015513405924364829
+13 494 0.02007961597002693
+13 495 0.0024001873316941806
+13 496 0.008956796628029503
+13 497 0.004741833508956796
+13 498 0.003512469265893923
+13 499 0.002517269640557311
+13 501 0.0005854115443156538
+13 502 0.0004097880810209577
+13 504 0.001990399250673223
+13 505 0.00040978808102095764
+13 509 0.010478866643250205
+13 510 0.02207001522070015
+13 513 0.0012293642430628731
+13 579 0.021660227139679192
+13 580 0.0002927057721578269
+13 581 0.00011708230886313077
+13 582 0.0012879053974944388
+13 583 0.0018147757873785272
+13 584 5.854115443156538e-05
+13 585 0.00011708230886313077
+13 586 0.0011122819341997422
+13 587 0.0008195761620419154
+13 589 0.0007610350076103501
+13 590 0.003395386957030792
+13 591 0.0026928931038520073
+13 592 0.009834913944502985
+13 593 0.009834913944502985
+13 594 0.00011708230886313077
+13 595 0.0013464465519260039
+13 596 0.0015806111696522653
+13 597 0.0002927057721578269
+13 598 0.00023416461772626153
+13 599 0.0009951996253366115
+13 600 0.0002927057721578269
+13 601 0.0012293642430628731
+13 602 0.00046832923545252306
+13 603 0.00011708230886313077
+13 604 0.003980798501346446
+13 605 0.013523006673691603
+13 606 0.011591148577449948
+13 607 0.006263903524177495
+13 608 0.014693829762322912
+13 610 0.0003512469265893923
+13 611 0.0012293642430628734
+13 612 5.854115443156538e-05
+13 613 0.005327245053272449
+13 614 0.0019318580962416575
+13 615 0.006615150450766888
+13 616 0.0026928931038520073
+13 617 0.0002927057721578269
+13 627 0.005268703898840884
+13 630 0.00011708230886313077
+13 696 0.00023416461772626153
+13 769 0.00076103500761035
+13 770 0.004683292354525231
+13 771 0.0011122819341997424
+13 772 5.854115443156538e-05
+13 774 0.00076103500761035
+13 775 0.003512469265893923
+13 776 0.008020138157124457
+14 74 0.0005157677571470676
+14 75 0.0005157677571470676
+14 76 0.004273504273504274
+14 77 0.0008104921898025347
+14 141 0.002799882110226938
+14 142 0.0003684055408193339
+14 160 0.001326259946949602
+14 161 0.0005894488653109342
+14 162 0.004420866489832007
+14 163 0.0050103153551429415
+14 196 7.368110816386678e-05
+14 197 0.0014736221632773356
+14 198 0.0030209254347185383
+14 199 0.0009578544061302684
+14 206 7.368110816386678e-05
+14 246 0.0013262599469496023
+14 247 0.0061155319776009425
+14 269 7.368110816386678e-05
+14 276 0.0034630120837017397
+14 277 0.0008841732979664015
+14 290 0.001399941055113469
+14 291 0.0052313586796345415
+14 292 0.0058944886531093425
+14 293 0.008989095195991748
+14 468 0.0199675803124079
+14 469 0.02460949012673151
+14 470 0.021220159151193633
+14 471 0.02586206896551724
+14 472 0.020704391394046565
+14 473 0.017978190391983492
+14 474 0.020114942528735632
+14 475 0.02586206896551724
+14 476 0.02291482463896257
+14 477 0.02475685234305924
+14 478 0.021293840259357502
+14 479 0.026009431181844976
+14 480 0.019451812555260833
+14 481 0.014294134983790155
+14 482 0.01422045387562629
+14 483 0.02726201002063071
+14 484 0.02026230474506337
+14 485 0.015694076038903628
+14 486 0.02726201002063071
+14 487 0.02733569112879458
+14 488 0.01215738284703802
+14 489 0.009652225169466549
+14 490 0.015767757147067494
+14 491 0.02460949012673151
+14 492 0.020114942528735635
+14 493 0.013704686118479222
+14 494 0.01333628057765989
+14 495 0.022988505747126436
+14 496 0.018272914824638966
+14 497 0.020851753610374304
+14 498 0.016578249336870028
+14 499 0.025567344532861774
+14 500 0.007515473032714411
+14 501 0.019157088122605366
+14 502 0.015104627173592693
+14 503 0.00987326849395815
+14 504 0.021293840259357502
+14 505 0.020999115826702035
+14 506 0.013262599469496024
+14 507 0.013483642793987621
+14 508 0.010389036251105217
+14 509 0.011715296198054817
+14 510 0.010167992926613616
+14 511 0.011199528440907752
+14 512 0.009357500736811082
+14 513 0.020335985853227233
+14 514 0.010683760683760684
+14 515 0.01215738284703802
+14 516 0.016357206012378427
+14 517 0.004052460949012673
+14 518 0.006704980842911877
+14 519 0.004273504273504274
+14 520 0.0036103743000294726
+14 521 0.004494547597995874
+14 522 0.003020925434718538
+14 523 0.002136752136752137
+14 524 0.0037577365163572064
+14 525 0.0005894488653109342
+14 526 0.0008104921898025347
+14 531 0.0002947244326554671
+14 541 0.0016209843796050694
+14 542 0.0006631299734748011
+14 551 0.0019157088122605363
+14 552 0.0009578544061302684
+14 563 0.005010315355142941
+14 564 0.004715590922487474
+14 565 0.0010315355142941351
+14 567 0.000663129973474801
+14 568 0.00022104332449160037
+14 579 0.006115531977600943
+15 468 0.01103996467211305
+15 469 0.010230367262824759
+15 470 0.0023551924633841174
+15 471 0.004121586810922205
+15 472 0.009199970560094207
+15 473 0.011334363730036065
+15 474 0.004047987046441452
+15 475 0.0027967910502686394
+15 476 0.0059615809229410476
+15 477 0.0014719952896150733
+15 478 0.0003679988224037683
+15 479 0.0016191948185765807
+15 482 0.011187164201074557
+15 483 0.0056671818650180315
+15 484 0.0014719952896150733
+15 485 0.0003679988224037683
+15 486 0.004563185397806727
+15 487 0.0073599764480753675
+15 491 0.002134393169941856
+15 492 0.0003679988224037683
+15 495 0.0011775962316920587
+15 496 0.0005151983513652757
+15 497 0.0005151983513652757
+15 498 7.359976448075367e-05
+15 499 0.0032383896371531613
+15 500 0.019945536174284243
+15 501 0.01781114300434239
+15 502 0.014204754544785456
+15 503 0.02524471921689851
+15 504 0.005446382571575771
+15 505 0.010524766320747773
+15 506 0.01832634135570766
+15 507 0.01884153970707294
+15 508 0.018473540884669168
+15 511 0.01781114300434239
+15 512 0.019356738058438214
+15 513 0.012143961139324354
+15 514 0.020755133583572533
+15 515 0.01862074041363068
+15 516 0.015014351954073748
+15 517 0.024361522043129462
+15 518 0.02333112534039891
+15 519 0.027011113564436594
+15 520 0.02465592110105248
+15 521 0.024067122985206444
+15 522 0.024508721572090966
+15 523 0.023478324869360415
+15 524 0.025980716861706044
+15 525 0.018031942297784646
+15 526 0.020607934054611025
+15 527 0.0012511959961728123
+15 528 0.0059615809229410476
+15 529 0.0025023919923456246
+15 530 0.0009567969382497977
+15 531 0.018179141826746157
+15 532 0.006255979980864061
+15 533 0.011187164201074557
+15 534 0.005225583278133511
+15 535 0.004710384926768235
+15 536 0.0016927945830573343
+15 537 0.0007359976448075366
+15 538 0.013247957606535658
+15 540 7.359976448075367e-05
+15 541 0.02340472510487966
+15 542 0.02031353499668801
+15 543 0.010745565614190034
+15 544 0.0032383896371531613
+15 545 0.0003679988224037683
+15 546 0.015529550305439023
+15 547 0.005593582100537278
+15 548 0.001103996467211305
+15 549 0.019356738058438214
+15 550 0.009126370795613454
+15 551 0.025465518510340766
+15 552 0.022374328402149115
+15 553 0.0029439905792301465
+15 557 0.011923161845882095
+15 558 0.0029439905792301465
+15 559 0.00942076985353647
+15 560 0.003679988224037683
+15 561 0.0002943990579230147
+15 563 0.019356738058438214
+15 564 0.024582321336571723
+15 565 0.02244792816662987
+15 566 0.015382350776477514
+15 567 0.019503937587399718
+15 568 0.015161551483035255
+15 569 0.0059615809229410476
+15 570 0.01023036726282476
+15 571 0.0030175903437109006
+15 572 0.003459188930595423
+15 573 0.005519982336056524
+15 574 0.0008095974092882903
+15 575 0.0008095974092882903
+15 576 7.359976448075367e-05
+15 577 0.0013247957606535659
+15 578 0.0008095974092882903
+16 556 1.0
+17 17 0.0004919184820801125
+17 18 0.0006324666198172875
+17 20 0.005762473647224175
+17 21 0.0021082220660576245
+17 22 0.0014757554462403375
+17 23 0.0024595924104005625
+17 70 0.000140548137737175
+17 71 0.000983836964160225
+17 72 0.0023190442726633872
+17 73 0.0004919184820801125
+17 76 0.000140548137737175
+17 77 0.0006324666198172875
+17 80 0.008151791988756148
+17 81 0.006676036542515813
+17 82 0.016303583977512297
+17 83 0.012297962052002813
+17 96 0.002178496134926213
+17 97 0.0007027406886858749
+17 98 7.02740688685875e-05
+17 99 0.0011243851018974
+17 100 0.009065354884047786
+17 101 0.007308503162333099
+17 102 0.015038650737877725
+17 103 0.017919887561489812
+17 141 0.0018271257905832748
+17 142 0.003794799718903725
+17 148 0.0013352073085031624
+17 153 0.001546029515108925
+17 154 0.0024595924104005625
+17 160 0.0134926212227688
+17 161 0.01883345045678145
+17 162 0.012438510189739986
+17 163 0.005200281096275476
+17 178 0.0007730147575544624
+17 179 0.0007730147575544624
+17 180 0.003021784961349263
+17 181 0.00758959943780745
+17 182 0.01377371749824315
+17 183 0.007238229093464512
+17 184 0.0026001405481377374
+17 196 0.0123682361208714
+17 197 0.007449051300070275
+17 198 0.0071679550245959235
+17 199 0.0202389318341532
+17 200 0.0004919184820801125
+17 201 0.027406886858749122
+17 202 0.020028109627547436
+17 206 0.0019676739283204497
+17 207 0.0004919184820801125
+17 218 0.000140548137737175
+17 220 0.00028109627547435
+17 247 7.02740688685875e-05
+17 256 0.000140548137737175
+17 257 0.0004919184820801125
+17 269 0.00035137034434293746
+17 275 0.005270555165144062
+17 276 0.0010541110330288123
+17 277 7.02740688685875e-05
+17 278 0.02508784258608574
+17 289 0.019465917076598734
+17 290 0.0044975404075896
+17 291 0.001546029515108925
+17 292 0.0002108222066057625
+17 293 7.02740688685875e-05
+17 484 0.0009135628952916374
+17 485 0.0007730147575544624
+17 489 0.000421644413211525
+17 491 0.0004919184820801125
+17 492 7.02740688685875e-05
+17 493 0.0013352073085031622
+17 494 0.002951510892480675
+17 509 0.0002108222066057625
+17 510 0.0033731553056922
+17 579 0.0027406886858749122
+17 580 0.003162333099086437
+17 581 0.0023190442726633877
+17 582 0.009978917779339425
+17 583 0.009065354884047788
+17 584 0.0010541110330288125
+17 585 0.0016865776528460997
+17 586 0.004356992269852425
+17 587 0.003513703443429375
+17 588 0.0016865776528461
+17 589 0.004567814476458187
+17 590 0.009065354884047786
+17 591 0.006886858749121575
+17 592 0.013914265635980324
+17 593 0.016795502459592413
+17 594 0.021503865073787775
+17 595 0.028742094167252288
+17 596 0.02178496134926212
+17 597 0.02059030217849614
+17 598 0.0026001405481377374
+17 599 0.004427266338721012
+17 600 0.015038650737877721
+17 601 0.015390021082220663
+17 602 0.008081517919887564
+17 603 0.007308503162333099
+17 604 0.021995783555867888
+17 605 0.02312016865776529
+17 606 0.019465917076598734
+17 607 0.025720309205903027
+17 608 0.019465917076598737
+17 609 0.0123682361208714
+17 610 0.013070976809557275
+17 611 0.02009838369641603
+17 612 0.019184820801124384
+17 613 0.012719606465214337
+17 614 0.0134926212227688
+17 615 0.0179901616303584
+17 616 0.015319747013352071
+17 617 0.005621925509486999
+17 618 0.00028109627547435
+17 619 0.0022487702037948
+17 620 0.0026704146170063252
+17 621 0.0002108222066057625
+17 622 0.007589599437807451
+17 623 0.0018271257905832748
+17 624 0.00084328882642305
+17 625 0.0007027406886858749
+17 626 0.0009135628952916376
+17 627 0.01981728742094167
+17 628 0.0002108222066057625
+17 629 0.0004919184820801124
+17 630 0.003021784961349262
+17 631 0.0006324666198172875
+17 632 7.02740688685875e-05
+17 633 7.02740688685875e-05
+17 696 0.021152494729444835
+17 769 0.0179901616303584
+17 770 0.020941672522839076
+17 771 0.011243851018974
+17 772 0.005411103302881238
+17 773 0.0028109627547434997
+17 774 0.0036542515811665496
+17 775 0.00871398453970485
+17 776 0.012157413914265636
+17 777 0.00035137034434293746
+18 82 0.0006012777151446825
+18 83 0.00015031942878617063
+18 103 0.0004509582863585119
+18 142 7.515971439308531e-05
+18 160 0.001428034573468621
+18 161 0.0071401728673431055
+18 162 0.0018789928598271326
+18 182 0.0005261180007515971
+18 196 0.0018789928598271326
+18 197 0.0009019165727170237
+18 198 0.0008267568583239384
+18 199 0.006238256294626081
+18 201 0.003006388575723412
+18 202 0.0018038331454340473
+18 275 0.00015031942878617063
+18 278 0.002179631717399474
+18 289 0.003607666290868095
+18 580 0.020142803457346866
+18 581 0.01698609545283728
+18 582 0.03013904547162721
+18 583 0.02705749718151071
+18 584 0.017737692596768138
+18 585 0.016910935738444193
+18 586 0.021270199173243146
+18 587 0.018489289740698987
+18 588 0.01924088688462984
+18 589 0.016910935738444197
+18 590 0.016234498308906428
+18 591 0.017061255167230362
+18 592 0.015182262307403233
+18 593 0.010672679443818113
+18 594 0.004509582863585118
+18 595 0.005712138293874483
+18 596 0.01570838030815483
+18 597 0.011424276587748966
+18 598 0.018714768883878245
+18 599 0.01969184517098835
+18 600 0.020593761743705377
+18 601 0.023900789177001128
+18 602 0.027583615182262308
+18 603 0.0266065388951522
+18 604 0.007591131153701616
+18 605 0.007666290868094702
+18 606 0.012701991732431419
+18 607 0.020668921458098462
+18 608 0.016309658023299513
+18 609 0.015031942878617064
+18 610 0.017061255167230362
+18 611 0.009845922585494176
+18 612 0.009845922585494176
+18 613 0.02435174746335964
+18 614 0.02450206689214581
+18 615 0.02720781661029688
+18 616 0.016459977452085682
+18 617 0.021119879744456973
+18 618 0.012251033446072902
+18 619 0.020744081172491546
+18 620 0.02247275460353251
+18 621 0.008643367155204812
+18 622 0.025779782036828264
+18 623 0.02006764374295378
+18 624 0.011724915445321307
+18 625 0.01089815858699737
+18 626 0.011800075159714395
+18 627 0.006914693724163849
+18 628 0.011499436302142051
+18 629 0.011273957158962795
+18 630 0.0214956783164224
+18 631 0.012251033446072907
+18 632 0.013077790304396842
+18 633 0.014656144306651634
+18 634 0.00496054114994363
+18 635 0.006914693724163849
+18 636 0.004735062006764375
+18 637 0.004359263434798948
+18 638 0.004208944006012777
+18 639 0.003757985719654265
+18 640 0.0021796317173994736
+18 641 0.003908305148440436
+18 642 0.0010522360015031943
+18 643 0.0006012777151446825
+18 648 0.0005261180007515971
+18 658 0.0020293122886133035
+18 659 0.0011273957158962795
+18 668 0.000751597143930853
+18 669 0.0011273957158962795
+18 680 0.002931228861330327
+18 681 0.0057872980082675695
+18 682 0.001428034573468621
+18 684 0.0006012777151446825
+18 685 0.0006012777151446825
+18 696 0.003757985719654265
+18 769 0.0009019165727170238
+18 770 0.002931228861330327
+18 771 0.00015031942878617063
+18 775 0.00022547914317925594
+18 776 0.001202555430289365
+19 580 0.012027744982290436
+19 581 0.009961629279811098
+19 582 0.0059031877213695395
+19 583 0.004574970484061393
+19 584 0.009223730814639905
+19 585 0.011289846517119244
+19 586 0.0038370720188902006
+19 587 0.0028040141676505316
+19 588 0.0057556080283353
+19 589 0.0014757969303423849
+19 590 0.0003689492325855962
+19 591 0.0008116883116883117
+19 598 0.011806375442739079
+19 599 0.004501180637544274
+19 600 0.0012544273907910272
+19 601 0.0012544273907910272
+19 602 0.005165289256198347
+19 603 0.005165289256198347
+19 609 0.0005165289256198347
+19 610 0.0008116883116883117
+19 613 0.0016971664698937428
+19 614 0.0010330578512396697
+19 615 0.0011068476977567888
+19 616 0.0003689492325855962
+19 617 0.0028040141676505316
+19 618 0.021325265643447465
+19 619 0.012322904368358915
+19 620 0.016528925619834708
+19 621 0.022432113341204252
+19 622 0.0042060212514757975
+19 623 0.009518890200708384
+19 624 0.021989374262101534
+19 625 0.015053128689492327
+19 626 0.01977567886658796
+19 628 0.017635773317591502
+19 629 0.019406729634002362
+19 630 0.010256788665879575
+19 631 0.01977567886658796
+19 632 0.018299881936245575
+19 633 0.014684179456906728
+19 634 0.027007083825265645
+19 635 0.02317001180637544
+19 636 0.026638134592680048
+19 637 0.02368654073199528
+19 638 0.02435064935064935
+19 639 0.02457201889020071
+19 640 0.021915584415584416
+19 641 0.0256788665879575
+19 642 0.018816410861865408
+19 643 0.01682408500590319
+19 644 0.002656434474616293
+19 645 0.009445100354191263
+19 646 0.0031729634002361272
+19 647 0.0025826446280991736
+19 648 0.016602715466351833
+19 649 0.006419716646989375
+19 650 0.010478158205430934
+19 651 0.004870129870129871
+19 652 0.003246753246753247
+19 653 0.0014757969303423849
+19 654 0.0013282172373081465
+19 655 0.010847107438016527
+19 656 0.0005903187721369539
+19 657 0.0005165289256198347
+19 658 0.02169421487603306
+19 659 0.019406729634002362
+19 660 0.007747933884297522
+19 661 0.001844746162927981
+19 662 0.00014757969303423848
+19 663 0.012101534828807558
+19 664 0.0038370720188902014
+19 665 0.0007378984651711924
+19 666 0.0157172373081464
+19 667 0.006050767414403779
+19 668 0.02221074380165289
+19 669 0.021103896103896107
+19 670 0.0028040141676505316
+19 674 0.006419716646989373
+19 675 0.0014020070838252656
+19 676 0.010109208972845335
+19 677 0.0030253837072018895
+19 678 7.378984651711924e-05
+19 679 0.00014757969303423848
+19 680 0.02206316410861866
+19 681 0.022432113341204252
+19 682 0.0256788665879575
+19 683 0.01977567886658796
+19 684 0.022358323494687134
+19 685 0.02088252656434475
+19 686 0.012470484061393153
+19 687 0.01586481700118064
+19 688 0.004427390791027155
+19 689 0.006198347107438017
+19 690 0.00974025974025974
+19 691 0.000885478158205431
+19 692 0.0003689492325855962
+19 693 0.00014757969303423848
+19 694 0.002877804014167651
+19 695 0.0016233766233766235
+20 673 1.0
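
The two `J_regressor_mano_*.txt` files use a plain sparse triplet format: a header line `# 21 778` giving the matrix shape (21 MANO hand joints by 778 hand-mesh vertices), followed by `joint vertex weight` rows; fingertip joints such as `4 745 1.0` and `20 673 1.0` are pinned to a single vertex with weight 1. The following is a small illustrative parser for that format, written from the layout visible above rather than taken from the repository's own code.

```python
# Hedged sketch (illustrative, not from this repo): parse the sparse
# "joint vertex weight" triplet files into a dense (n_joints, n_vertices)
# regressor matrix, using the "# 21 778" header for the shape.
import numpy as np


def load_sparse_regressor(path):
    with open(path) as f:
        header = f.readline().split()                 # e.g. ['#', '21', '778']
        n_joints, n_vertices = int(header[1]), int(header[2])
        J = np.zeros((n_joints, n_vertices))
        for line in f:
            j, v, w = line.split()
            J[int(j), int(v)] = float(w)
    return J


# Example usage (paths and vertex array are placeholders):
# J_left = load_sparse_regressor('data/body_models/J_regressor_mano_LEFT.txt')
# hand_joints = J_left @ mano_vertices   # (21, 3) given (778, 3) vertices
```
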
diff --git a/data/body_models/J_regressor_mano_RIGHT.txt b/data/body_models/J_regressor_mano_RIGHT.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3151077d5f7d3a680eb3ad55d115685def2ea33b
--- /dev/null
+++ b/data/body_models/J_regressor_mano_RIGHT.txt
@@ -0,0 +1,1902 @@
+# 21 778
+0 4 0.0019103600293901542
+0 5 0.0027920646583394562
+0 6 0.00029390154298310065
+0 7 0.00014695077149155033
+0 25 0.0016164584864070536
+0 26 0.000440852314474651
+0 32 0.011756061719324026
+0 33 0.021234386480529024
+0 34 0.019838354151359296
+0 35 0.016311535635562088
+0 36 0.015870683321087434
+0 37 0.02343864805290228
+0 38 0.01671565025716385
+0 39 0.020499632623071272
+0 40 0.005437178545187362
+0 41 0.010139603232916973
+0 42 0.002645113886847906
+0 43 0.00014695077149155033
+0 44 0.02005878030859662
+0 45 0.02233651726671565
+0 50 0.01763409257898604
+0 51 0.01704628949301984
+0 52 0.019838354151359296
+0 53 0.02079353416605437
+0 54 0.00822924320352682
+0 55 0.00822924320352682
+0 78 0.011572373254959589
+0 79 0.011939750183688464
+0 84 0.01704628949301984
+0 85 0.019691403379867745
+0 88 0.005437178545187362
+0 89 0.0007347538574577516
+0 90 0.014548126377663484
+0 91 0.018736223365172666
+0 92 0.011645848640705364
+0 106 0.018515797207935343
+0 107 0.02204261572373255
+0 108 0.012417340191036004
+0 109 0.009992652461425423
+0 110 0.016311535635562088
+0 111 0.01880969875091844
+0 112 0.0073475385745775165
+0 113 0.0014695077149155032
+0 114 0.005731080088170463
+0 116 0.02204261572373255
+0 117 0.012123438648052902
+0 118 0.013005143277002204
+0 119 0.016385011021307863
+0 120 0.008155767817781044
+0 121 0.011315209404849376
+0 122 0.009037472446730345
+0 130 0.0073475385745775165
+0 131 0.00911094783247612
+0 178 0.001763409257898604
+0 179 0.002351212343864805
+0 190 0.019544452608376194
+0 191 0.019691403379867745
+0 192 0.01704628949301984
+0 193 0.016605437178545186
+0 200 0.002351212343864805
+0 203 0.00822924320352682
+0 204 0.007641440117560617
+0 205 0.01704628949301984
+0 207 0.001763409257898604
+0 208 0.005290227773695812
+0 209 0.01763409257898604
+0 210 0.019691403379867745
+0 211 0.019691403379867745
+0 214 0.011315209404849376
+0 215 0.011315209404849376
+0 216 0.007641440117560617
+0 217 0.00822924320352682
+0 218 0.002351212343864805
+0 219 0.0011756061719324026
+0 227 0.002351212343864805
+0 229 0.007788390889052168
+0 231 0.002204261572373255
+0 232 0.016311535635562088
+0 233 0.006759735488611315
+0 234 0.011168258633357825
+0 235 0.019544452608376194
+0 236 0.0016164584864070536
+0 239 0.011315209404849376
+0 241 0.0007347538574577516
+0 242 0.002351212343864805
+0 243 0.0036737692872887582
+0 244 0.0011756061719324026
+0 254 0.0064658339456282144
+0 255 0.0038207200587803084
+0 256 0.002351212343864805
+0 257 0.002351212343864805
+0 264 0.014107274063188832
+0 265 0.00440852314474651
+0 279 0.011315209404849376
+0 284 0.00896399706098457
+0 285 0.0029390154298310064
+1 0 0.014595751184471957
+1 1 0.025294207550053488
+1 2 0.019180803912578332
+1 3 0.01039278618370778
+1 4 0.03156044627846554
+1 5 0.025752712822864135
+1 6 0.014977838911814154
+1 7 0.023307351367874065
+1 8 0.005654898364664528
+1 9 0.009170105456212748
+1 10 0.002063273727647868
+1 11 0.0006113403637475165
+1 12 0.0018340210912425497
+1 14 0.001222680727495033
+1 15 7.641754546843957e-05
+1 16 0.0011462631820265935
+1 17 0.0004585052728106374
+1 18 0.00015283509093687913
+1 19 0.0003820877273421978
+1 22 7.641754546843957e-05
+1 24 0.01413724591166132
+1 25 0.019257221458046772
+1 26 0.024377197004432218
+1 27 0.017346782821335782
+1 28 0.0007641754546843956
+1 29 0.0022161088185847473
+1 30 0.0006877579092159561
+1 31 0.0005349228182790769
+1 32 0.0005349228182790768
+1 33 0.0005349228182790769
+1 34 0.0024071526822558465
+1 35 0.002445361454990066
+1 36 0.029802842732691428
+1 37 0.022122879413113253
+1 38 0.010029802842732692
+1 39 0.02334556014060829
+1 40 0.029344337459880795
+1 41 0.032171786642213054
+1 42 0.02009781445819961
+1 43 0.009934280910897143
+1 60 0.004355800091701055
+1 61 0.00855876509246523
+1 62 0.0004585052728106374
+1 63 0.003285954455142901
+1 64 0.0012990982729634726
+1 65 7.641754546843957e-05
+1 66 0.0019868561821794286
+1 67 0.004814305364511693
+1 68 0.008253094910591475
+1 69 0.0018340210912425497
+1 70 0.0003820877273421978
+1 71 7.641754546843957e-05
+1 88 0.021320495185694635
+1 89 0.013907993275256002
+1 90 0.01986856182179429
+1 91 0.013564114320648022
+1 92 0.003763564114320649
+1 93 0.0004585052728106374
+1 94 0.008329512456059913
+1 95 0.007565337001375517
+1 104 0.0027510316368638244
+1 105 0.0072596668195017595
+1 109 0.009705028274491823
+1 110 0.005654898364664528
+1 111 0.015436344184624792
+1 112 0.019180803912578332
+1 113 0.03339446736970809
+1 114 0.0340058077334556
+1 115 0.02559987773192725
+1 116 0.008405930001528351
+1 117 0.0017767079321412199
+1 118 0.00527281063732233
+1 119 0.00032477456824086816
+1 122 0.004967140455448571
+1 123 0.007259666819501758
+1 124 0.0016811860003056705
+1 125 0.0025217790004585057
+1 126 0.008176677365123033
+1 129 0.00030567018187375826
+1 145 0.00030567018187375826
+1 146 0.0006877579092159561
+1 147 7.641754546843957e-05
+1 152 7.641754546843957e-05
+1 157 0.002063273727647868
+1 158 0.0016047684548372307
+1 159 0.0032095369096744614
+1 188 0.0007641754546843956
+1 190 0.0019868561821794286
+1 191 0.0004585052728106374
+1 192 0.0016047684548372307
+1 193 0.005884151001069847
+1 207 0.00015283509093687913
+1 208 7.641754546843957e-05
+1 209 0.00030567018187375826
+1 216 0.0008405930001528353
+1 217 0.003897294818890417
+1 218 0.0008405930001528353
+1 219 0.0014519333639003516
+1 227 0.005502063273727648
+1 229 0.008635182637933671
+1 230 0.004126547455295736
+1 231 0.009705028274491824
+1 232 0.01245605991135565
+1 233 0.016888277548525142
+1 234 0.001413724591166132
+1 235 0.005654898364664528
+1 236 0.012838147638697846
+1 239 0.00026746140913953847
+1 240 0.01543634418462479
+1 241 0.0006877579092159561
+1 242 0.0032095369096744614
+1 248 0.004890722909980132
+1 249 0.0005349228182790769
+1 250 0.0015283509093687911
+1 251 0.0009170105456212748
+1 252 0.0029038667278007036
+1 253 0.005502063273727649
+1 254 0.0019868561821794286
+1 255 0.0002292526364053187
+1 264 0.028885832187070158
+1 265 0.029650007641754548
+1 266 0.006953996637628001
+1 267 0.002445361454990066
+1 268 0.00015283509093687913
+1 285 0.010087116001834023
+1 286 0.007794589637780836
+1 287 0.0025981965459269452
+1 697 0.0004585052728106374
+1 699 7.641754546843957e-05
+1 700 0.00030567018187375826
+1 704 0.0002292526364053187
+1 705 0.0008405930001528353
+1 706 7.641754546843957e-05
+2 0 0.0027531810402559712
+2 1 0.0034972840241089364
+2 2 0.007887491628841432
+2 3 0.0056551826772825355
+2 4 0.009152466701391472
+2 5 0.01674231713669172
+2 6 0.02708534861224793
+2 7 0.02209985862043307
+2 8 0.00833395341915321
+2 9 0.009152466701391472
+2 10 0.011682416846491553
+2 11 0.0055063620805119425
+2 12 0.005431951782126646
+2 13 0.0011161544757794478
+2 14 0.006176054765979612
+2 15 0.0017858471612471167
+2 16 0.0007441029838529652
+2 19 0.0003720514919264826
+2 26 0.000967333879008855
+2 27 0.0008929235806235583
+2 28 0.013245033112582783
+2 29 0.013765905201279856
+2 30 0.009970979983629735
+2 31 0.011384775652950369
+2 36 0.0023811295483294886
+2 37 0.00014882059677059304
+2 38 7.441029838529652e-05
+2 39 0.0020834883547883026
+2 40 0.0055063620805119425
+2 41 0.009896569685244438
+2 42 0.022843961604286034
+2 43 0.032666120991145166
+2 60 0.00364610462087953
+2 61 0.0017858471612471167
+2 62 0.0002976411935411861
+2 63 0.000967333879008855
+2 64 0.0014882059677059304
+2 65 0.0004464617903117792
+2 68 0.0002976411935411861
+2 69 7.441029838529652e-05
+2 88 0.01562616266091227
+2 89 0.027234169209018527
+2 90 0.00513431058858546
+2 91 0.0006696926854676687
+2 93 7.441029838529652e-05
+2 94 0.0005952823870823722
+2 104 0.025225091152615526
+2 105 0.017858471612471165
+2 113 0.0035716943224942334
+2 114 0.002604360443485378
+2 115 0.010566262370712107
+2 123 0.026787707418706754
+2 124 0.021504576233350697
+2 125 0.01882580549148002
+2 126 0.02083488354788303
+2 127 0.0002232308951558896
+2 128 0.0002976411935411861
+2 129 0.0017114368628618197
+2 144 0.0002232308951558896
+2 145 0.0013393853709353374
+2 158 0.002604360443485378
+2 193 0.0003720514919264826
+2 217 0.0007441029838529652
+2 219 0.0004464617903117792
+2 227 0.003199642830567751
+2 229 0.003125232532182454
+2 230 0.008854825507850286
+2 231 0.00982215938685914
+2 232 0.002009078056403006
+2 233 0.007813081330456134
+2 235 7.441029838529652e-05
+2 236 0.01912344668502121
+2 240 0.01480764937867401
+2 248 0.03318699307984225
+2 249 0.01823052310439765
+2 250 0.02887119577349505
+2 251 0.02500186025745963
+2 252 0.02864796487833916
+2 253 0.032889351886301064
+2 259 0.00014882059677059304
+2 264 0.0002232308951558896
+2 265 0.0005952823870823722
+2 266 0.015402931765756382
+2 267 0.01622144504799464
+2 286 0.02805268249125679
+2 287 0.025820373539697895
+2 697 0.014510008185132822
+2 698 0.008631594612694398
+2 699 0.011161544757794479
+2 700 0.01049185207232681
+2 701 0.00811072252399732
+2 702 0.013393853709353377
+2 703 0.010938313862638589
+2 704 0.008185132822382618
+2 705 0.02187662772527718
+2 706 0.018825805491480024
+2 707 0.011905647741647447
+2 708 0.007217798943373763
+2 709 0.005059900290200163
+2 710 0.003199642830567751
+2 711 0.0019346677580177095
+2 712 0.005952823870823722
+2 713 0.00364610462087953
+2 714 0.00364610462087953
+2 715 0.0026787707418706747
+2 716 0.0021578986531735995
+2 721 0.0006696926854676687
+2 722 0.0002232308951558896
+2 723 0.0002232308951558896
+2 725 0.0004464617903117792
+2 731 0.0032740531289530473
+2 732 0.0008185132822382618
+2 741 0.0005952823870823722
+2 742 0.0005208720886970756
+2 746 0.0002232308951558896
+2 749 0.0005208720886970756
+2 753 0.0034972840241089364
+2 754 0.004018156112806012
+2 755 0.0014882059677059304
+2 757 0.0008929235806235583
+2 758 0.0014137956693206339
+2 759 0.0003720514919264826
+2 760 7.441029838529652e-05
+3 6 0.0019164148301024542
+3 7 0.0014004569912287167
+3 8 0.000884499152354979
+3 9 0.00029483305078499295
+3 10 0.004422495761774894
+3 11 0.0011793322031399718
+3 12 0.0005896661015699859
+3 14 0.0011056239404437236
+3 28 0.011203655929829732
+3 29 0.0037591213975086604
+3 30 0.004496204024471142
+3 31 0.011645905506007222
+3 43 0.0019164148301024544
+3 89 0.0005896661015699859
+3 104 0.009729490675904768
+3 105 0.002137539618191199
+3 123 0.006412618854573597
+3 124 0.0187956069875433
+3 125 0.013414903810717178
+3 126 0.004938453600648632
+3 230 0.0007370826269624824
+3 231 0.00022112478808874474
+3 236 0.0005159578388737376
+3 240 0.0008844991523549787
+3 248 0.007665659320409817
+3 249 0.013120070759932186
+3 250 0.009434657625119773
+3 251 0.012088155082184712
+3 252 0.004348787499078646
+3 253 0.003022038770546178
+3 266 0.0029483305078499295
+3 267 0.0125304046583622
+3 286 0.002727205719761185
+3 287 0.005896661015699859
+3 697 0.01805852436058082
+3 698 0.019016731775632047
+3 699 0.021375396181911987
+3 700 0.01968010613989828
+3 701 0.023512935800103187
+3 702 0.01975381440259453
+3 703 0.021965062283481978
+3 704 0.019164148301024544
+3 705 0.015331318640819633
+3 706 0.017837399572492075
+3 707 0.02889363897692931
+3 708 0.02130168791921574
+3 709 0.027050932409523103
+3 710 0.024544851477850665
+3 711 0.0209331466057345
+3 712 0.0232181027493182
+3 713 0.023070686223925697
+3 714 0.024102601901673175
+3 715 0.018353357411365814
+3 716 0.017026608682833344
+3 717 0.0016952900420137097
+3 718 0.0062652023291811
+3 719 0.0033168718213311705
+3 720 0.00125304046583622
+3 721 0.016879192157440846
+3 722 0.01090882287904474
+3 723 0.008402741947372299
+3 724 0.004717328812559887
+3 725 0.010982531141740989
+3 726 0.0033168718213311705
+3 727 0.0008107908896587306
+3 730 7.370826269624824e-05
+3 731 0.022775853173140702
+3 732 0.018279649148669565
+3 733 0.009803198938601014
+3 734 0.003022038770546178
+3 735 0.0003685413134812412
+3 736 0.011719613768703471
+3 737 0.003906537922901157
+3 738 0.0008107908896587306
+3 739 0.013488612073413427
+3 740 0.005306994914129874
+3 741 0.021301687919215745
+3 742 0.019606397877202027
+3 743 0.0022112478808874476
+3 746 0.006338910591877348
+3 747 0.00125304046583622
+3 748 0.0016952900420137097
+3 749 0.009876907201297264
+3 750 0.003022038770546178
+3 751 7.370826269624824e-05
+3 753 0.025208225842116898
+3 754 0.0209331466057345
+3 755 0.023291811012014444
+3 756 0.017837399572492075
+3 757 0.021449104444608236
+3 758 0.01975381440259453
+3 759 0.01171961376870347
+3 760 0.01348861207341343
+3 761 0.003906537922901157
+3 762 0.005306994914129872
+3 763 0.007960492371194809
+3 764 0.0008107908896587306
+3 765 0.0003685413134812412
+3 767 0.0022112478808874476
+3 768 0.0011056239404437238
+4 745 1.0
+5 0 0.0012638674343491084
+5 1 0.0001404297149276787
+5 2 0.00035107428731919675
+5 3 0.002808594298553574
+5 8 0.004072461732902682
+5 9 0.0007723634321022329
+5 10 0.004774610307541076
+5 11 0.01418340120769555
+5 12 0.012357814913635726
+5 13 0.01930908580255582
+5 14 0.007934278893413846
+5 15 0.020011234377194213
+5 16 0.0021064457239151806
+5 17 0.0006319337171745541
+5 18 0.0022468754388428594
+5 19 0.009127931470299114
+5 21 0.00042128914478303613
+5 24 0.0009127931470299115
+5 25 7.021485746383936e-05
+5 26 0.0001404297149276787
+5 27 0.0010532228619575903
+5 28 0.0004212891447830361
+5 29 0.0015447268642044658
+5 30 0.003932032017975004
+5 31 0.0009127931470299115
+5 46 0.0006319337171745542
+5 47 0.00035107428731919675
+5 48 0.003721387445583485
+5 49 0.0027383794410897346
+5 56 0.0002808594298553574
+5 57 7.021485746383936e-05
+5 58 0.0010532228619575903
+5 59 0.0028788091560174134
+5 60 0.010040724617329027
+5 61 0.005687403454570988
+5 62 0.029981744137059403
+5 63 0.017483499508496
+5 64 0.02029209380704957
+5 65 0.024294340682488414
+5 66 0.0029490240134812527
+5 67 0.0011234377194214297
+5 68 0.005827833169498665
+5 69 0.00975986518747367
+5 74 0.00217666058137902
+5 75 0.0010532228619575903
+5 76 0.00035107428731919675
+5 77 0.00021064457239151807
+5 86 0.0007723634321022329
+5 87 0.0021064457239151806
+5 93 0.018536722370453586
+5 94 0.0016851565791321445
+5 95 0.0001404297149276787
+5 104 7.021485746383936e-05
+5 105 0.0001404297149276787
+5 127 0.023592192107850022
+5 128 0.02710293498104199
+5 129 0.020713382951832608
+5 132 0.023030473248139307
+5 133 0.005195899452324112
+5 134 0.005195899452324112
+5 135 0.01305996348827412
+5 136 0.008495997753124563
+5 137 0.014323830922623225
+5 138 0.01818564808313439
+5 139 0.011515236624069652
+5 140 0.008215138323269205
+5 143 0.010742873191967421
+5 144 0.016991995506249125
+5 145 0.010040724617329027
+5 146 0.00035107428731919675
+5 147 0.0011234377194214297
+5 149 0.013832326920376354
+5 150 0.016430276646538407
+5 151 0.010181154332256704
+5 152 0.011023732621822779
+5 155 0.00035107428731919675
+5 156 0.001966016008987502
+5 157 7.021485746383936e-05
+5 158 0.003932032017975004
+5 164 0.0034405280157281284
+5 165 0.005195899452324111
+5 166 0.0014745120067406266
+5 167 0.0014745120067406264
+5 168 0.026049712119084405
+5 169 0.02927959556242101
+5 170 0.023873051537705376
+5 171 0.016008987501755372
+5 172 0.027102934981041993
+5 173 0.016921780648785283
+5 174 0.005546973739643309
+5 175 0.005406544024715631
+5 176 0.013551467490520995
+5 177 0.00758320460609465
+5 183 7.021485746383936e-05
+5 185 0.009127931470299114
+5 186 0.017834573795815194
+5 187 0.008074708608341525
+5 189 0.007161915461311614
+5 194 0.010602443477039742
+5 195 0.01060244347703974
+5 206 0.0013340822918129478
+5 212 0.007091700603847775
+5 213 0.0013340822918129476
+5 219 0.0002808594298553574
+5 220 0.00435332116275804
+5 222 0.0002808594298553574
+5 223 0.00042128914478303613
+5 225 0.0016851565791321445
+5 226 0.00042128914478303613
+5 227 0.000983008004493751
+5 228 0.00975986518747367
+5 230 0.001825586294059823
+5 231 7.021485746383936e-05
+5 246 0.00035107428731919675
+5 258 0.020924027524224127
+5 259 0.022398539530964757
+5 260 0.015587698356972338
+5 261 0.012568459486027245
+5 262 0.009619435472545991
+5 263 0.01305996348827412
+5 266 0.0010532228619575903
+5 267 0.0005617188597107148
+5 268 0.004283106305294201
+5 269 0.0017553714365959837
+5 270 0.005266114309787951
+5 271 0.004844825165004915
+5 274 0.018045218368206713
+5 276 0.0002808594298553574
+5 277 0.00021064457239151807
+5 280 0.0001404297149276787
+5 288 0.00540654402471563
+5 290 7.021485746383936e-05
+5 358 0.0002808594298553574
+5 359 0.00035107428731919675
+5 362 0.00021064457239151807
+5 363 0.0002808594298553574
+5 365 7.021485746383936e-05
+5 366 0.0009127931470299116
+5 367 0.0013340822918129476
+5 368 0.005125684594860273
+5 369 0.0034405280157281284
+5 370 0.0013340822918129476
+5 371 0.00021064457239151807
+5 373 0.00042128914478303613
+5 375 0.00035107428731919675
+5 378 0.004493750877685719
+5 379 0.0034405280157281284
+5 380 0.004634180592613397
+5 383 0.00042128914478303613
+5 385 0.0016149417216683051
+5 386 0.001404297149276787
+5 387 0.0016851565791321445
+5 388 0.0002808594298553574
+5 399 0.0014745120067406264
+6 46 0.019904998869034157
+6 47 0.01960340797707909
+6 48 0.025559828093191583
+6 49 0.02352408957249491
+6 56 0.022166930558697125
+6 57 0.020131192038000453
+6 58 0.02194073738973083
+6 59 0.028952725627686037
+6 62 0.0005277840609213601
+6 65 0.00022619316896629722
+6 86 0.02382568046444997
+6 87 0.022543919173640955
+6 127 0.0012063635678202518
+6 128 0.0007539772298876573
+6 132 0.0006031817839101259
+6 133 0.017643067179371183
+6 134 0.02382568046444997
+6 135 0.01379778330694413
+6 136 0.01259141973912388
+6 137 0.004448465656337178
+6 138 0.003091306642539395
+6 139 0.009424715373595717
+6 140 0.012214431124180048
+6 143 0.0005277840609213601
+6 144 0.0012817612908090175
+6 150 0.0008293749528764231
+6 155 0.019678805700067855
+6 156 0.0244288622483601
+6 164 0.019980396592022914
+6 165 0.017944658071326246
+6 166 0.023222498680539848
+6 167 0.023901078187438737
+6 168 0.002789715750584332
+6 169 0.002186533966674206
+6 170 0.00987710171152831
+6 171 0.005881022393123726
+6 172 0.004071477041393349
+6 173 0.011837442509236221
+6 174 0.022166930558697128
+6 175 0.02382568046444997
+6 176 0.019377214808112796
+6 177 0.013119203800045236
+6 185 0.0016587499057528462
+6 186 0.004448465656337178
+6 187 0.0005277840609213601
+6 189 0.020809771544899342
+6 194 0.015154942320741913
+6 195 0.01839704440925884
+6 212 0.021262157882831936
+6 213 0.022317726004674656
+6 221 0.006333408731056322
+6 222 0.016210510442584633
+6 223 0.018472442132247607
+6 224 0.00987710171152831
+6 225 0.02744477116791073
+6 226 0.020583578375933047
+6 228 0.0005277840609213602
+6 237 0.012516022016135112
+6 238 0.011912840232224985
+6 245 0.011912840232224985
+6 258 0.0052024428862248355
+6 259 0.002337329412651738
+6 260 0.007162783683932745
+6 261 0.013043806077056472
+6 262 0.0016587499057528462
+6 263 0.007388976852899043
+6 272 0.014174771921887958
+6 273 0.012817612908090177
+6 274 0.0059564201161124925
+6 280 0.019301817085124028
+6 281 0.011385056171303627
+6 282 0.011460453894292393
+6 283 0.017643067179371186
+6 294 0.003920681595415819
+6 295 0.0069365905149664465
+6 296 0.0037698861494382865
+6 297 0.00512704516323607
+6 298 0.006634999623011385
+6 299 0.002789715750584332
+6 300 0.0021865339666742064
+6 301 0.0038452838724270517
+6 302 0.0005277840609213601
+6 303 0.0006031817839101259
+6 305 0.00030159089195506294
+6 316 0.0016587499057528462
+6 321 0.0009047726758651889
+6 330 0.0021111362436854408
+6 331 0.0015079544597753145
+6 340 0.00512704516323607
+6 341 0.004599261102314709
+6 342 0.0011309658448314859
+6 344 0.0007539772298876573
+6 345 0.00022619316896629722
+7 46 0.008690077640857611
+7 47 0.009188688653037966
+7 48 0.0033478167960680964
+7 49 0.0034902770852624832
+7 56 0.010898212123370611
+7 57 0.012322815015314481
+7 58 0.004202578531234419
+7 59 0.003276586651470902
+7 86 0.00648194315834461
+7 87 0.0016382933257354513
+7 133 0.00035615072298596765
+7 134 0.0015670631811382577
+7 155 0.009829759954412709
+7 156 0.004131348386637225
+7 164 0.0009259918797635161
+7 165 0.0006410713013747418
+7 166 0.003917657952845645
+7 167 0.0050573402664007405
+7 174 0.001638293325735451
+7 175 0.0014246028919438706
+7 189 0.0009259918797635161
+7 194 0.00028492057838877413
+7 195 0.0006410713013747418
+7 212 0.00042738086758316123
+7 213 0.0037039675190540643
+7 221 0.019517059619631027
+7 222 0.016739083980340477
+7 223 0.0143172590640359
+7 224 0.02443193959683738
+7 225 0.00683809388133058
+7 226 0.01111190255716219
+7 237 0.016739083980340477
+7 238 0.018092456727687157
+7 245 0.01367618776266116
+7 272 0.02236626540351877
+7 273 0.01923213904124225
+7 280 0.011040672412564997
+7 281 0.020086900776408578
+7 282 0.01859106773986751
+7 283 0.0165253935465489
+7 294 0.024004558729254222
+7 295 0.024075788873851416
+7 296 0.02443193959683738
+7 297 0.025357931476600898
+7 298 0.026283923356364414
+7 299 0.023933328584657028
+7 300 0.022722416126504736
+7 301 0.02514424104280932
+7 302 0.01738015528171522
+7 303 0.020941662511574897
+7 304 0.007835315905691288
+7 305 0.017380155281715225
+7 306 0.011396823135550965
+7 307 0.0036327373744568705
+7 308 0.0012821426027494836
+7 309 0.002777975639290548
+7 310 0.011966664292328516
+7 311 0.005342260844789515
+7 312 0.0038464278082484507
+7 313 0.0014958330365410642
+7 314 0.0007835315905691288
+7 315 0.008191466628677256
+7 316 0.022651185981907542
+7 317 0.00035615072298596765
+7 321 0.02101289265617209
+7 322 0.01225158487071729
+7 323 0.007764085761094094
+7 324 0.002564285205498967
+7 325 0.01994444048721419
+7 326 0.008690077640857611
+7 327 0.0024218249163045803
+7 328 0.0165253935465489
+7 329 0.006980554170524965
+7 330 0.028064676971294254
+7 331 0.021084122800769284
+7 332 0.0019232139041242254
+7 333 0.00021369043379158061
+7 334 0.010969442267967804
+7 335 0.0024930550609017737
+7 336 0.008690077640857611
+7 337 0.003988888097442838
+7 338 0.00028492057838877413
+7 340 0.019588289764228224
+7 341 0.0242182491630458
+7 342 0.021867654391338417
+7 343 0.014103568630244322
+7 344 0.018662297884464708
+7 345 0.014673409787021868
+7 346 0.006125792435358643
+7 347 0.009758529809815513
+7 348 0.0017095234703326447
+7 349 0.0031341263622765153
+7 350 0.004772419688011967
+7 351 0.0006410713013747418
+7 352 0.0008547617351663223
+7 353 0.00042738086758316123
+7 354 0.001068452168957903
+7 355 0.0009972220243607095
+8 317 1.0
+9 11 0.0002498906728306366
+9 13 0.0002498906728306366
+9 14 0.0009995626913225464
+9 15 0.0022490160554757294
+9 16 0.0029986880739676387
+9 17 0.002249016055475729
+9 18 0.007746610857749733
+9 19 0.00949584556756419
+9 20 0.0013743987005685012
+9 21 0.00437308677453614
+9 22 0.0009995626913225461
+9 23 0.00018741800462297744
+9 48 0.0004997813456612732
+9 59 0.0002498906728306366
+9 62 0.0014368713687761604
+9 63 0.000874617354907228
+9 64 6.247266820765915e-05
+9 65 6.247266820765915e-05
+9 66 0.0024989067283063657
+9 67 0.000437308677453614
+9 68 0.0006871993502842506
+9 69 0.0029986880739676387
+9 71 0.0004997813456612732
+9 74 0.015555694383707127
+9 75 0.017867183107390515
+9 76 0.017242456425313923
+9 77 0.00868370088086462
+9 83 6.247266820765915e-05
+9 87 0.0004997813456612732
+9 93 0.0033110514150059348
+9 127 0.0006247266820765914
+9 132 0.004810395451989753
+9 133 0.0006247266820765914
+9 135 0.0001249453364153183
+9 136 0.0004997813456612732
+9 137 0.015555694383707127
+9 138 0.007246829512088461
+9 139 0.005997376147935278
+9 140 0.008683700880864622
+9 141 0.005997376147935278
+9 142 0.0025613793965140247
+9 143 0.015743112388330104
+9 144 0.009558318235771848
+9 145 0.0032485787467982754
+9 146 0.0015618167051914785
+9 147 0.006122321484350596
+9 148 0.0025613793965140247
+9 149 0.0071843568438808006
+9 150 0.01243206097332417
+9 151 0.013993877678515648
+9 152 0.007809083525957393
+9 157 0.0001249453364153183
+9 158 0.0023114887236833884
+9 160 0.0019991253826450927
+9 161 0.0002498906728306366
+9 162 0.0005622540138689324
+9 163 0.0021240707190604106
+9 164 0.0029362154057599797
+9 165 0.002561379396514025
+9 166 0.0007496720184919098
+9 167 0.0007496720184919097
+9 168 0.002124070719060411
+9 169 0.0003123633410382957
+9 170 0.0006871993502842506
+9 171 0.002249016055475729
+9 174 0.0028737427375523207
+9 175 0.0018741800462297744
+9 176 0.009433372899356529
+9 177 0.006247266820765914
+9 181 0.00018741800462297744
+9 182 0.0009995626913225464
+9 183 0.004248141438120822
+9 185 0.019179109139751356
+9 186 0.01661772974323733
+9 187 0.019054163803336036
+9 194 0.0015618167051914785
+9 195 0.0001249453364153183
+9 196 0.0004997813456612732
+9 197 0.0014993440369838195
+9 198 0.0003748360092459549
+9 199 0.0001249453364153183
+9 202 6.247266820765915e-05
+9 206 0.013181732991816079
+9 207 6.247266820765915e-05
+9 212 0.0018741800462297742
+9 213 0.0002498906728306366
+9 218 0.0003123633410382957
+9 219 0.0006871993502842506
+9 220 0.014868495033422876
+9 225 0.0006247266820765914
+9 227 0.0006871993502842506
+9 228 0.021802961204473042
+9 230 0.0002498906728306366
+9 246 0.020803398513150495
+9 247 0.017304929093521583
+9 258 0.0004997813456612732
+9 259 0.0027487974011370024
+9 260 0.0017492347098144558
+9 261 0.002623852064721684
+9 262 0.01974136315362029
+9 263 0.01655525707502967
+9 268 0.007746610857749734
+9 269 0.02167801586805772
+9 270 0.019054163803336036
+9 271 0.011932279627662898
+9 274 0.0066221028300118695
+9 275 0.0007496720184919098
+9 276 0.016742675079652648
+9 277 0.02205285187730368
+9 288 0.022427687886549634
+9 289 0.0003123633410382957
+9 290 0.00730930218029612
+9 291 0.005685012806896982
+9 292 0.0057474854751046415
+9 293 0.008933591553695257
+9 356 0.0014993440369838195
+9 357 0.0014993440369838193
+9 358 0.00668457549821953
+9 359 0.004685450115574436
+9 360 0.0007496720184919098
+9 361 0.0007496720184919098
+9 362 0.0024989067283063657
+9 363 0.0038733054288748667
+9 364 0.0014368713687761604
+9 365 0.004498032110951459
+9 366 0.009933154245017804
+9 367 0.010245517586056099
+9 368 0.015993003061160742
+9 369 0.015993003061160742
+9 370 0.021115761854188793
+9 371 0.01693009308427563
+9 372 0.0009995626913225464
+9 373 0.0037483600924595483
+9 374 0.008996064221902918
+9 375 0.012432060973324168
+9 376 0.004498032110951458
+9 377 0.0031861060785906164
+9 378 0.017554819766352217
+9 379 0.01749234709814456
+9 380 0.01649278440682201
+9 381 0.008308864871618667
+9 382 0.006434684825388891
+9 383 0.016055475729368402
+9 384 0.012557006309739488
+9 385 0.01018304491784844
+9 386 0.015180858374461174
+9 387 0.01155744361841694
+9 388 0.009058536890110576
+9 389 0.0028112700693446614
+9 391 0.00018741800462297744
+9 392 0.0005622540138689324
+9 394 0.0018117073780221152
+9 395 0.0004997813456612732
+9 399 0.01611794839757606
+9 402 0.0008746173549072279
+9 470 0.0007496720184919098
+9 471 0.0004997813456612732
+9 478 0.0007496720184919098
+9 479 0.0004997813456612732
+9 480 0.0026863247329293434
+9 481 0.002623852064721684
+9 483 0.0001249453364153183
+9 484 0.0001249453364153183
+9 485 0.0014993440369838195
+9 486 0.0004997813456612732
+9 488 0.008996064221902916
+9 489 0.006059848816142937
+9 490 0.006497157493596552
+9 491 0.0001249453364153183
+9 492 0.0003748360092459549
+9 493 0.001311926032360842
+9 494 0.000437308677453614
+9 495 0.0017492347098144558
+9 496 0.002623852064721684
+9 497 0.0027487974011370024
+9 498 0.0006247266820765914
+9 509 0.0020615980508527517
+9 510 0.0003748360092459549
+9 579 0.0019991253826450927
+10 74 0.0005264345341054373
+10 75 0.0021809430698653833
+10 76 0.000752049334436339
+10 137 0.000827254267879973
+10 143 0.0006016394675490712
+10 150 0.0003008197337745356
+10 151 0.0006016394675490712
+10 185 0.004361886139730767
+10 186 0.0010528690682108748
+10 187 0.003910656539068963
+10 206 0.0001504098668872678
+10 220 0.0003008197337745356
+10 228 0.0030834022711889904
+10 246 0.003985861472512596
+10 247 0.0012784838685417762
+10 262 0.003910656539068963
+10 263 0.0011280740016545085
+10 269 0.0032338121380762574
+10 270 0.002857787470858088
+10 271 0.0003008197337745356
+10 276 0.000902459201323607
+10 277 0.00556516507482891
+10 288 0.0027825825374144545
+10 356 0.020305332029781156
+10 357 0.019703692562232082
+10 358 0.02549447243739189
+10 359 0.023764758968188315
+10 360 0.02587049710461006
+10 361 0.022486275099646538
+10 362 0.022411070166202904
+10 363 0.02278709483342107
+10 364 0.026321726705271865
+10 365 0.02007971722945025
+10 366 0.016093855756937656
+10 367 0.022260660299315636
+10 368 0.011882379484094157
+10 369 0.009400616680454237
+10 370 0.00962623148078514
+10 371 0.011431149883432353
+10 372 0.021583815898322933
+10 373 0.024742423102955553
+10 374 0.01947807776190118
+10 375 0.01789877415958487
+10 376 0.023388734300970146
+10 377 0.023689554034744677
+10 378 0.009400616680454237
+10 379 0.005865984808603443
+10 380 0.01135594494998872
+10 381 0.022486275099646538
+10 382 0.015341806422501316
+10 383 0.01135594494998872
+10 384 0.01158155975031962
+10 385 0.019703692562232082
+10 386 0.01504098668872678
+10 387 0.018124388959915774
+10 388 0.010077461081446944
+10 389 0.02293750470030834
+10 390 0.01383770775362864
+10 391 0.017372339625479433
+10 392 0.019703692562232086
+10 393 0.011882379484094157
+10 394 0.024667218169511923
+10 395 0.024667218169511916
+10 396 0.012333609084755958
+10 397 0.011506354816875987
+10 398 0.013236068286079568
+10 399 0.0070692637437015865
+10 400 0.01940287282845755
+10 401 0.016093855756937656
+10 402 0.020530946830112053
+10 403 0.008197337745356097
+10 404 0.01759795442581033
+10 405 0.021508610964879295
+10 406 0.008197337745356095
+10 407 0.013988117620515906
+10 408 0.008949387079792434
+10 409 0.006467624276152515
+10 410 0.005264345341054373
+10 411 0.005565165074828909
+10 412 0.003835451605625329
+10 413 0.002105738136421749
+10 414 0.0012784838685417764
+10 415 0.002556967737083553
+10 417 7.52049334436339e-05
+10 420 0.0020305332029781154
+10 421 0.0006016394675490712
+10 422 0.0006016394675490712
+10 427 7.52049334436339e-05
+10 430 0.004737910806948936
+10 431 0.002331352936752651
+10 432 0.0001504098668872678
+10 440 0.0010528690682108748
+10 441 0.0021057381364217496
+10 446 7.52049334436339e-05
+10 452 0.004512296006618034
+10 453 0.003609836805294428
+10 454 0.0006016394675490712
+10 456 0.0006016394675490712
+10 457 0.0004512296006618035
+11 356 0.011297349184080336
+11 357 0.011888060252528984
+11 358 0.004430333013364838
+11 359 0.004430333013364838
+11 360 0.009229860444510078
+11 361 0.011371188067636416
+11 362 0.0038396219449161927
+11 363 0.002805877575131064
+11 364 0.005759432917374288
+11 365 0.0014767776711216124
+11 366 0.0003691944177804031
+11 367 0.0014029387875655322
+11 372 0.011371188067636418
+11 373 0.004504171896920917
+11 374 0.0012552610204533705
+11 375 0.0011075832533412094
+11 376 0.005316399616037805
+11 377 0.005685594033818208
+11 381 0.001772133205345935
+11 382 0.0003691944177804031
+11 385 0.00118142213689729
+11 386 0.0005168721848925644
+11 387 0.0011075832533412094
+11 388 7.383888355608063e-05
+11 389 0.0031012331093553864
+11 390 0.019345787491693123
+11 391 0.010928154766299934
+11 392 0.01299564350587019
+11 393 0.02082256516281474
+11 394 0.0057594329173742895
+11 395 0.00945137709517832
+11 396 0.017352137635678947
+11 397 0.02001033744369785
+11 398 0.018238204238351912
+11 400 0.01794284870412759
+11 401 0.019124270841024884
+11 402 0.016170715498781657
+11 403 0.022816215018828915
+11 404 0.01727829875212287
+11 405 0.014546260060547885
+11 406 0.0239976371557262
+11 407 0.022963892785941076
+11 408 0.02695119249796943
+11 409 0.023776120505057962
+11 410 0.019493465258805284
+11 411 0.023849959388614037
+11 412 0.026581998080189025
+11 413 0.020601048512146496
+11 414 0.019493465258805288
+11 415 0.02163479288193162
+11 416 0.004873366314701322
+11 417 0.007900760540500627
+11 418 0.0042088163626965965
+11 419 0.0016982943217898545
+11 420 0.018238204238351912
+11 421 0.012035738019641142
+11 422 0.012331093553865465
+11 423 0.0055379162667060465
+11 424 0.004061138595584434
+11 425 0.0016982943217898542
+11 426 0.0008122277191168869
+11 427 0.00834379384183711
+11 428 0.0005168721848925643
+11 429 0.0015506165546776932
+11 430 0.023406926087277558
+11 431 0.019124270841024884
+11 432 0.016392232149449903
+11 433 0.005907110684486449
+11 434 0.0019198109724580966
+11 435 0.015432326663220851
+11 436 0.006940855054271579
+11 437 0.0013290999040094513
+11 438 0.013364837923650594
+11 439 0.00694085505427158
+11 440 0.02126559846415122
+11 441 0.02355460385438972
+11 442 0.002732038691574983
+11 444 7.383888355608063e-05
+11 446 0.010854315882743852
+11 447 0.0031012331093553864
+11 448 0.007753082773388465
+11 449 0.0018459720889020155
+11 450 0.00044303330133648377
+11 451 0.00044303330133648377
+11 452 0.023776120505057962
+11 453 0.02229934283393635
+11 454 0.02126559846415122
+11 455 0.013290999040094512
+11 456 0.018385882005464073
+11 457 0.015580004430333012
+11 458 0.010189765930739126
+11 459 0.012035738019641142
+11 460 0.0034704275271357893
+11 461 0.004578010780476998
+11 462 0.005907110684486449
+11 463 0.000590711068448645
+11 464 0.000590711068448645
+11 465 0.0002953555342243225
+11 466 0.0019936498560141768
+11 467 0.0013290999040094513
+12 444 1.0
+13 16 0.0014635288607891346
+13 17 0.002575810794988877
+13 18 0.005737033134293408
+13 19 0.001990399250673223
+13 20 0.007785973539398196
+13 21 0.008664090855871677
+13 22 0.002985598876009834
+13 23 0.002224563868399485
+13 63 5.854115443156538e-05
+13 66 0.0018147757873785268
+13 67 0.0006439526987472192
+13 68 0.0002927057721578269
+13 69 0.0008195761620419153
+13 70 0.0007024938531787846
+13 71 0.0033953869570307925
+13 72 0.0024001873316941806
+13 73 0.00023416461772626153
+13 74 0.009308043554618896
+13 75 0.007551808921671934
+13 76 0.01890879288139562
+13 77 0.013230300901533777
+13 80 0.0013464465519260039
+13 81 0.0002927057721578269
+13 82 0.0016976934785153963
+13 83 0.0040978808102095764
+13 93 0.00017562346329469617
+13 100 0.00017562346329469617
+13 102 0.00011708230886313077
+13 103 0.00035124692658939234
+13 137 0.00011708230886313077
+13 141 0.020021074815595362
+13 142 0.016625687858564567
+13 143 0.0016391523240838306
+13 144 0.0005268703898840885
+13 145 0.0002927057721578269
+13 146 0.002868516567146704
+13 147 0.006673691605198454
+13 148 0.008839714319166374
+13 149 0.0002927057721578269
+13 150 0.0002927057721578269
+13 151 0.0012293642430628731
+13 152 0.0011122819341997424
+13 157 0.0008781173164734808
+13 158 0.0004097880810209577
+13 160 0.02681184872965695
+13 161 0.023592085235920848
+13 162 0.03096827069429809
+13 163 0.02476290832455216
+13 178 0.0002927057721578269
+13 179 5.854115443156538e-05
+13 180 0.0009366584709050461
+13 181 0.00444912773679897
+13 182 0.013464465519260038
+13 183 0.0167427701674277
+13 184 0.00017562346329469617
+13 185 5.854115443156538e-05
+13 186 0.0002927057721578269
+13 187 0.0008195761620419153
+13 196 0.017503805175038047
+13 197 0.023416461772626154
+13 198 0.023416461772626154
+13 199 0.02921203606135113
+13 201 0.0018733169418100922
+13 202 0.006439526987472192
+13 206 0.015162158997775435
+13 207 0.0006439526987472192
+13 218 0.0007610350076103501
+13 219 0.00046832923545252306
+13 220 0.006673691605198454
+13 227 0.00011708230886313077
+13 228 0.0009951996253366115
+13 246 0.0106544901065449
+13 247 0.014576747453459781
+13 262 0.00011708230886313077
+13 268 0.0033368458025992264
+13 269 0.010420325488818641
+13 270 0.0035710104203254887
+13 271 0.002985598876009834
+13 275 0.009834913944502985
+13 276 0.02142606252195293
+13 277 0.01164968973188151
+13 278 0.00035124692658939234
+13 288 0.004741833508956796
+13 289 0.014693829762322912
+13 290 0.02207001522070015
+13 291 0.017913593256059006
+13 292 0.011005737033134292
+13 293 0.010478866643250203
+13 358 0.0003512469265893923
+13 363 5.854115443156538e-05
+13 365 0.00035124692658939234
+13 366 0.0007024938531787846
+13 367 0.00017562346329469617
+13 368 0.00017562346329469617
+13 369 0.0009951996253366115
+13 370 0.005151621589977754
+13 371 0.005385786207704015
+13 374 0.0016976934785153963
+13 375 0.0017562346329469615
+13 376 0.0004097880810209577
+13 377 0.0003512469265893923
+13 378 0.00046832923545252306
+13 379 0.0015220700152206996
+13 381 0.0014635288607891346
+13 382 0.0009951996253366115
+13 383 0.00532724505327245
+13 384 0.0037466338836201845
+13 386 0.0011708230886313077
+13 387 0.00011708230886313077
+13 388 0.0012293642430628731
+13 389 0.0002927057721578269
+13 394 0.00017562346329469617
+13 399 0.0033953869570307925
+13 468 5.854115443156538e-05
+13 469 0.0011122819341997424
+13 470 0.0027514342582835734
+13 471 0.0012879053974944384
+13 474 5.854115443156538e-05
+13 475 0.0002927057721578269
+13 476 0.0002927057721578269
+13 477 0.0018147757873785268
+13 478 0.0020489404051047887
+13 479 0.0011122819341997424
+13 480 0.004332045427935838
+13 481 0.006556609296335323
+13 483 0.00046832923545252306
+13 484 0.012352183585060298
+13 485 0.014869453225617611
+13 486 0.005912656597588104
+13 487 0.004214963119072708
+13 488 0.01164968973188151
+13 489 0.015806111696522657
+13 490 0.008312843929282283
+13 491 0.009834913944502985
+13 492 0.006146821215314366
+13 493 0.015513405924364829
+13 494 0.02007961597002693
+13 495 0.0024001873316941806
+13 496 0.008956796628029503
+13 497 0.004741833508956796
+13 498 0.003512469265893923
+13 499 0.002517269640557311
+13 501 0.0005854115443156538
+13 502 0.0004097880810209577
+13 504 0.001990399250673223
+13 505 0.00040978808102095764
+13 509 0.010478866643250205
+13 510 0.02207001522070015
+13 513 0.0012293642430628731
+13 579 0.021660227139679192
+13 580 0.0002927057721578269
+13 581 0.00011708230886313077
+13 582 0.0012879053974944388
+13 583 0.0018147757873785272
+13 584 5.854115443156538e-05
+13 585 0.00011708230886313077
+13 586 0.0011122819341997422
+13 587 0.0008195761620419154
+13 589 0.0007610350076103501
+13 590 0.003395386957030792
+13 591 0.0026928931038520073
+13 592 0.009834913944502985
+13 593 0.009834913944502985
+13 594 0.00011708230886313077
+13 595 0.0013464465519260039
+13 596 0.0015806111696522653
+13 597 0.0002927057721578269
+13 598 0.00023416461772626153
+13 599 0.0009951996253366115
+13 600 0.0002927057721578269
+13 601 0.0012293642430628731
+13 602 0.00046832923545252306
+13 603 0.00011708230886313077
+13 604 0.003980798501346446
+13 605 0.013523006673691603
+13 606 0.011591148577449948
+13 607 0.006263903524177495
+13 608 0.014693829762322912
+13 610 0.0003512469265893923
+13 611 0.0012293642430628734
+13 612 5.854115443156538e-05
+13 613 0.005327245053272449
+13 614 0.0019318580962416575
+13 615 0.006615150450766888
+13 616 0.0026928931038520073
+13 617 0.0002927057721578269
+13 627 0.005268703898840884
+13 630 0.00011708230886313077
+13 696 0.00023416461772626153
+13 769 0.00076103500761035
+13 770 0.004683292354525231
+13 771 0.0011122819341997424
+13 772 5.854115443156538e-05
+13 774 0.00076103500761035
+13 775 0.003512469265893923
+13 776 0.008020138157124457
+14 74 0.0005157677571470676
+14 75 0.0005157677571470676
+14 76 0.004273504273504274
+14 77 0.0008104921898025347
+14 141 0.002799882110226938
+14 142 0.0003684055408193339
+14 160 0.001326259946949602
+14 161 0.0005894488653109342
+14 162 0.004420866489832007
+14 163 0.0050103153551429415
+14 196 7.368110816386678e-05
+14 197 0.0014736221632773356
+14 198 0.0030209254347185383
+14 199 0.0009578544061302684
+14 206 7.368110816386678e-05
+14 246 0.0013262599469496023
+14 247 0.0061155319776009425
+14 269 7.368110816386678e-05
+14 276 0.0034630120837017397
+14 277 0.0008841732979664015
+14 290 0.001399941055113469
+14 291 0.0052313586796345415
+14 292 0.0058944886531093425
+14 293 0.008989095195991748
+14 468 0.0199675803124079
+14 469 0.02460949012673151
+14 470 0.021220159151193633
+14 471 0.02586206896551724
+14 472 0.020704391394046565
+14 473 0.017978190391983492
+14 474 0.020114942528735632
+14 475 0.02586206896551724
+14 476 0.02291482463896257
+14 477 0.02475685234305924
+14 478 0.021293840259357502
+14 479 0.026009431181844976
+14 480 0.019451812555260833
+14 481 0.014294134983790155
+14 482 0.01422045387562629
+14 483 0.02726201002063071
+14 484 0.02026230474506337
+14 485 0.015694076038903628
+14 486 0.02726201002063071
+14 487 0.02733569112879458
+14 488 0.01215738284703802
+14 489 0.009652225169466549
+14 490 0.015767757147067494
+14 491 0.02460949012673151
+14 492 0.020114942528735635
+14 493 0.013704686118479222
+14 494 0.01333628057765989
+14 495 0.022988505747126436
+14 496 0.018272914824638966
+14 497 0.020851753610374304
+14 498 0.016578249336870028
+14 499 0.025567344532861774
+14 500 0.007515473032714411
+14 501 0.019157088122605366
+14 502 0.015104627173592693
+14 503 0.00987326849395815
+14 504 0.021293840259357502
+14 505 0.020999115826702035
+14 506 0.013262599469496024
+14 507 0.013483642793987621
+14 508 0.010389036251105217
+14 509 0.011715296198054817
+14 510 0.010167992926613616
+14 511 0.011199528440907752
+14 512 0.009357500736811082
+14 513 0.020335985853227233
+14 514 0.010683760683760684
+14 515 0.01215738284703802
+14 516 0.016357206012378427
+14 517 0.004052460949012673
+14 518 0.006704980842911877
+14 519 0.004273504273504274
+14 520 0.0036103743000294726
+14 521 0.004494547597995874
+14 522 0.003020925434718538
+14 523 0.002136752136752137
+14 524 0.0037577365163572064
+14 525 0.0005894488653109342
+14 526 0.0008104921898025347
+14 531 0.0002947244326554671
+14 541 0.0016209843796050694
+14 542 0.0006631299734748011
+14 551 0.0019157088122605363
+14 552 0.0009578544061302684
+14 563 0.005010315355142941
+14 564 0.004715590922487474
+14 565 0.0010315355142941351
+14 567 0.000663129973474801
+14 568 0.00022104332449160037
+14 579 0.006115531977600943
+15 468 0.01103996467211305
+15 469 0.010230367262824759
+15 470 0.0023551924633841174
+15 471 0.004121586810922205
+15 472 0.009199970560094207
+15 473 0.011334363730036065
+15 474 0.004047987046441452
+15 475 0.0027967910502686394
+15 476 0.0059615809229410476
+15 477 0.0014719952896150733
+15 478 0.0003679988224037683
+15 479 0.0016191948185765807
+15 482 0.011187164201074557
+15 483 0.0056671818650180315
+15 484 0.0014719952896150733
+15 485 0.0003679988224037683
+15 486 0.004563185397806727
+15 487 0.0073599764480753675
+15 491 0.002134393169941856
+15 492 0.0003679988224037683
+15 495 0.0011775962316920587
+15 496 0.0005151983513652757
+15 497 0.0005151983513652757
+15 498 7.359976448075367e-05
+15 499 0.0032383896371531613
+15 500 0.019945536174284243
+15 501 0.01781114300434239
+15 502 0.014204754544785456
+15 503 0.02524471921689851
+15 504 0.005446382571575771
+15 505 0.010524766320747773
+15 506 0.01832634135570766
+15 507 0.01884153970707294
+15 508 0.018473540884669168
+15 511 0.01781114300434239
+15 512 0.019356738058438214
+15 513 0.012143961139324354
+15 514 0.020755133583572533
+15 515 0.01862074041363068
+15 516 0.015014351954073748
+15 517 0.024361522043129462
+15 518 0.02333112534039891
+15 519 0.027011113564436594
+15 520 0.02465592110105248
+15 521 0.024067122985206444
+15 522 0.024508721572090966
+15 523 0.023478324869360415
+15 524 0.025980716861706044
+15 525 0.018031942297784646
+15 526 0.020607934054611025
+15 527 0.0012511959961728123
+15 528 0.0059615809229410476
+15 529 0.0025023919923456246
+15 530 0.0009567969382497977
+15 531 0.018179141826746157
+15 532 0.006255979980864061
+15 533 0.011187164201074557
+15 534 0.005225583278133511
+15 535 0.004710384926768235
+15 536 0.0016927945830573343
+15 537 0.0007359976448075366
+15 538 0.013247957606535658
+15 540 7.359976448075367e-05
+15 541 0.02340472510487966
+15 542 0.02031353499668801
+15 543 0.010745565614190034
+15 544 0.0032383896371531613
+15 545 0.0003679988224037683
+15 546 0.015529550305439023
+15 547 0.005593582100537278
+15 548 0.001103996467211305
+15 549 0.019356738058438214
+15 550 0.009126370795613454
+15 551 0.025465518510340766
+15 552 0.022374328402149115
+15 553 0.0029439905792301465
+15 557 0.011923161845882095
+15 558 0.0029439905792301465
+15 559 0.00942076985353647
+15 560 0.003679988224037683
+15 561 0.0002943990579230147
+15 563 0.019356738058438214
+15 564 0.024582321336571723
+15 565 0.02244792816662987
+15 566 0.015382350776477514
+15 567 0.019503937587399718
+15 568 0.015161551483035255
+15 569 0.0059615809229410476
+15 570 0.01023036726282476
+15 571 0.0030175903437109006
+15 572 0.003459188930595423
+15 573 0.005519982336056524
+15 574 0.0008095974092882903
+15 575 0.0008095974092882903
+15 576 7.359976448075367e-05
+15 577 0.0013247957606535659
+15 578 0.0008095974092882903
+16 556 1.0
+17 17 0.0004919184820801125
+17 18 0.0006324666198172875
+17 20 0.005762473647224175
+17 21 0.0021082220660576245
+17 22 0.0014757554462403375
+17 23 0.0024595924104005625
+17 70 0.000140548137737175
+17 71 0.000983836964160225
+17 72 0.0023190442726633872
+17 73 0.0004919184820801125
+17 76 0.000140548137737175
+17 77 0.0006324666198172875
+17 80 0.008151791988756148
+17 81 0.006676036542515813
+17 82 0.016303583977512297
+17 83 0.012297962052002813
+17 96 0.002178496134926213
+17 97 0.0007027406886858749
+17 98 7.02740688685875e-05
+17 99 0.0011243851018974
+17 100 0.009065354884047786
+17 101 0.007308503162333099
+17 102 0.015038650737877725
+17 103 0.017919887561489812
+17 141 0.0018271257905832748
+17 142 0.003794799718903725
+17 148 0.0013352073085031624
+17 153 0.001546029515108925
+17 154 0.0024595924104005625
+17 160 0.0134926212227688
+17 161 0.01883345045678145
+17 162 0.012438510189739986
+17 163 0.005200281096275476
+17 178 0.0007730147575544624
+17 179 0.0007730147575544624
+17 180 0.003021784961349263
+17 181 0.00758959943780745
+17 182 0.01377371749824315
+17 183 0.007238229093464512
+17 184 0.0026001405481377374
+17 196 0.0123682361208714
+17 197 0.007449051300070275
+17 198 0.0071679550245959235
+17 199 0.0202389318341532
+17 200 0.0004919184820801125
+17 201 0.027406886858749122
+17 202 0.020028109627547436
+17 206 0.0019676739283204497
+17 207 0.0004919184820801125
+17 218 0.000140548137737175
+17 220 0.00028109627547435
+17 247 7.02740688685875e-05
+17 256 0.000140548137737175
+17 257 0.0004919184820801125
+17 269 0.00035137034434293746
+17 275 0.005270555165144062
+17 276 0.0010541110330288123
+17 277 7.02740688685875e-05
+17 278 0.02508784258608574
+17 289 0.019465917076598734
+17 290 0.0044975404075896
+17 291 0.001546029515108925
+17 292 0.0002108222066057625
+17 293 7.02740688685875e-05
+17 484 0.0009135628952916374
+17 485 0.0007730147575544624
+17 489 0.000421644413211525
+17 491 0.0004919184820801125
+17 492 7.02740688685875e-05
+17 493 0.0013352073085031622
+17 494 0.002951510892480675
+17 509 0.0002108222066057625
+17 510 0.0033731553056922
+17 579 0.0027406886858749122
+17 580 0.003162333099086437
+17 581 0.0023190442726633877
+17 582 0.009978917779339425
+17 583 0.009065354884047788
+17 584 0.0010541110330288125
+17 585 0.0016865776528460997
+17 586 0.004356992269852425
+17 587 0.003513703443429375
+17 588 0.0016865776528461
+17 589 0.004567814476458187
+17 590 0.009065354884047786
+17 591 0.006886858749121575
+17 592 0.013914265635980324
+17 593 0.016795502459592413
+17 594 0.021503865073787775
+17 595 0.028742094167252288
+17 596 0.02178496134926212
+17 597 0.02059030217849614
+17 598 0.0026001405481377374
+17 599 0.004427266338721012
+17 600 0.015038650737877721
+17 601 0.015390021082220663
+17 602 0.008081517919887564
+17 603 0.007308503162333099
+17 604 0.021995783555867888
+17 605 0.02312016865776529
+17 606 0.019465917076598734
+17 607 0.025720309205903027
+17 608 0.019465917076598737
+17 609 0.0123682361208714
+17 610 0.013070976809557275
+17 611 0.02009838369641603
+17 612 0.019184820801124384
+17 613 0.012719606465214337
+17 614 0.0134926212227688
+17 615 0.0179901616303584
+17 616 0.015319747013352071
+17 617 0.005621925509486999
+17 618 0.00028109627547435
+17 619 0.0022487702037948
+17 620 0.0026704146170063252
+17 621 0.0002108222066057625
+17 622 0.007589599437807451
+17 623 0.0018271257905832748
+17 624 0.00084328882642305
+17 625 0.0007027406886858749
+17 626 0.0009135628952916376
+17 627 0.01981728742094167
+17 628 0.0002108222066057625
+17 629 0.0004919184820801124
+17 630 0.003021784961349262
+17 631 0.0006324666198172875
+17 632 7.02740688685875e-05
+17 633 7.02740688685875e-05
+17 696 0.021152494729444835
+17 769 0.0179901616303584
+17 770 0.020941672522839076
+17 771 0.011243851018974
+17 772 0.005411103302881238
+17 773 0.0028109627547434997
+17 774 0.0036542515811665496
+17 775 0.00871398453970485
+17 776 0.012157413914265636
+17 777 0.00035137034434293746
+18 82 0.0006012777151446825
+18 83 0.00015031942878617063
+18 103 0.0004509582863585119
+18 142 7.515971439308531e-05
+18 160 0.001428034573468621
+18 161 0.0071401728673431055
+18 162 0.0018789928598271326
+18 182 0.0005261180007515971
+18 196 0.0018789928598271326
+18 197 0.0009019165727170237
+18 198 0.0008267568583239384
+18 199 0.006238256294626081
+18 201 0.003006388575723412
+18 202 0.0018038331454340473
+18 275 0.00015031942878617063
+18 278 0.002179631717399474
+18 289 0.003607666290868095
+18 580 0.020142803457346866
+18 581 0.01698609545283728
+18 582 0.03013904547162721
+18 583 0.02705749718151071
+18 584 0.017737692596768138
+18 585 0.016910935738444193
+18 586 0.021270199173243146
+18 587 0.018489289740698987
+18 588 0.01924088688462984
+18 589 0.016910935738444197
+18 590 0.016234498308906428
+18 591 0.017061255167230362
+18 592 0.015182262307403233
+18 593 0.010672679443818113
+18 594 0.004509582863585118
+18 595 0.005712138293874483
+18 596 0.01570838030815483
+18 597 0.011424276587748966
+18 598 0.018714768883878245
+18 599 0.01969184517098835
+18 600 0.020593761743705377
+18 601 0.023900789177001128
+18 602 0.027583615182262308
+18 603 0.0266065388951522
+18 604 0.007591131153701616
+18 605 0.007666290868094702
+18 606 0.012701991732431419
+18 607 0.020668921458098462
+18 608 0.016309658023299513
+18 609 0.015031942878617064
+18 610 0.017061255167230362
+18 611 0.009845922585494176
+18 612 0.009845922585494176
+18 613 0.02435174746335964
+18 614 0.02450206689214581
+18 615 0.02720781661029688
+18 616 0.016459977452085682
+18 617 0.021119879744456973
+18 618 0.012251033446072902
+18 619 0.020744081172491546
+18 620 0.02247275460353251
+18 621 0.008643367155204812
+18 622 0.025779782036828264
+18 623 0.02006764374295378
+18 624 0.011724915445321307
+18 625 0.01089815858699737
+18 626 0.011800075159714395
+18 627 0.006914693724163849
+18 628 0.011499436302142051
+18 629 0.011273957158962795
+18 630 0.0214956783164224
+18 631 0.012251033446072907
+18 632 0.013077790304396842
+18 633 0.014656144306651634
+18 634 0.00496054114994363
+18 635 0.006914693724163849
+18 636 0.004735062006764375
+18 637 0.004359263434798948
+18 638 0.004208944006012777
+18 639 0.003757985719654265
+18 640 0.0021796317173994736
+18 641 0.003908305148440436
+18 642 0.0010522360015031943
+18 643 0.0006012777151446825
+18 648 0.0005261180007515971
+18 658 0.0020293122886133035
+18 659 0.0011273957158962795
+18 668 0.000751597143930853
+18 669 0.0011273957158962795
+18 680 0.002931228861330327
+18 681 0.0057872980082675695
+18 682 0.001428034573468621
+18 684 0.0006012777151446825
+18 685 0.0006012777151446825
+18 696 0.003757985719654265
+18 769 0.0009019165727170238
+18 770 0.002931228861330327
+18 771 0.00015031942878617063
+18 775 0.00022547914317925594
+18 776 0.001202555430289365
+19 580 0.012027744982290436
+19 581 0.009961629279811098
+19 582 0.0059031877213695395
+19 583 0.004574970484061393
+19 584 0.009223730814639905
+19 585 0.011289846517119244
+19 586 0.0038370720188902006
+19 587 0.0028040141676505316
+19 588 0.0057556080283353
+19 589 0.0014757969303423849
+19 590 0.0003689492325855962
+19 591 0.0008116883116883117
+19 598 0.011806375442739079
+19 599 0.004501180637544274
+19 600 0.0012544273907910272
+19 601 0.0012544273907910272
+19 602 0.005165289256198347
+19 603 0.005165289256198347
+19 609 0.0005165289256198347
+19 610 0.0008116883116883117
+19 613 0.0016971664698937428
+19 614 0.0010330578512396697
+19 615 0.0011068476977567888
+19 616 0.0003689492325855962
+19 617 0.0028040141676505316
+19 618 0.021325265643447465
+19 619 0.012322904368358915
+19 620 0.016528925619834708
+19 621 0.022432113341204252
+19 622 0.0042060212514757975
+19 623 0.009518890200708384
+19 624 0.021989374262101534
+19 625 0.015053128689492327
+19 626 0.01977567886658796
+19 628 0.017635773317591502
+19 629 0.019406729634002362
+19 630 0.010256788665879575
+19 631 0.01977567886658796
+19 632 0.018299881936245575
+19 633 0.014684179456906728
+19 634 0.027007083825265645
+19 635 0.02317001180637544
+19 636 0.026638134592680048
+19 637 0.02368654073199528
+19 638 0.02435064935064935
+19 639 0.02457201889020071
+19 640 0.021915584415584416
+19 641 0.0256788665879575
+19 642 0.018816410861865408
+19 643 0.01682408500590319
+19 644 0.002656434474616293
+19 645 0.009445100354191263
+19 646 0.0031729634002361272
+19 647 0.0025826446280991736
+19 648 0.016602715466351833
+19 649 0.006419716646989375
+19 650 0.010478158205430934
+19 651 0.004870129870129871
+19 652 0.003246753246753247
+19 653 0.0014757969303423849
+19 654 0.0013282172373081465
+19 655 0.010847107438016527
+19 656 0.0005903187721369539
+19 657 0.0005165289256198347
+19 658 0.02169421487603306
+19 659 0.019406729634002362
+19 660 0.007747933884297522
+19 661 0.001844746162927981
+19 662 0.00014757969303423848
+19 663 0.012101534828807558
+19 664 0.0038370720188902014
+19 665 0.0007378984651711924
+19 666 0.0157172373081464
+19 667 0.006050767414403779
+19 668 0.02221074380165289
+19 669 0.021103896103896107
+19 670 0.0028040141676505316
+19 674 0.006419716646989373
+19 675 0.0014020070838252656
+19 676 0.010109208972845335
+19 677 0.0030253837072018895
+19 678 7.378984651711924e-05
+19 679 0.00014757969303423848
+19 680 0.02206316410861866
+19 681 0.022432113341204252
+19 682 0.0256788665879575
+19 683 0.01977567886658796
+19 684 0.022358323494687134
+19 685 0.02088252656434475
+19 686 0.012470484061393153
+19 687 0.01586481700118064
+19 688 0.004427390791027155
+19 689 0.006198347107438017
+19 690 0.00974025974025974
+19 691 0.000885478158205431
+19 692 0.0003689492325855962
+19 693 0.00014757969303423848
+19 694 0.002877804014167651
+19 695 0.0016233766233766235
+20 673 1.0
diff --git a/data/body_models/SMPLX_to_J14.pkl b/data/body_models/SMPLX_to_J14.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..db8aa5c74b860a2b9555383d5ca2a09523851fe4
--- /dev/null
+++ b/data/body_models/SMPLX_to_J14.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5df844ddea85b0a400a2e8dbe63d09d19f2b1b7ec0e0e952daeae08f83d82d61
+size 4692193
diff --git a/data/body_models/SMPL_NEUTRAL.pkl b/data/body_models/SMPL_NEUTRAL.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..65ae47d34e5b26720c9ccdd2614044832f0e30f2
--- /dev/null
+++ b/data/body_models/SMPL_NEUTRAL.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4924f235e63f7c5d5b690acedf736419c2edb846a2d69fc0956169615fa75688
+size 247186228
diff --git a/data/body_models/all_means.pkl b/data/body_models/all_means.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..03ff93f70af27dac0b808dbe45761c95ce8df397
--- /dev/null
+++ b/data/body_models/all_means.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:010c2178eff5fd58d07bab3717002e959fe62541aaaef778b09414ec0237690d
+size 4758
diff --git a/data/body_models/downsample_mat_smplx.pkl b/data/body_models/downsample_mat_smplx.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..45e09fb0bf098421656f6c3418ac05bd8fc32f18
--- /dev/null
+++ b/data/body_models/downsample_mat_smplx.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b67d12e8e9af767d9856fea8cb3366bfa8025fdf17cd4e25fc8b10f9a45eca9e
+size 18310685
diff --git a/data/body_models/joints_regressor_cmr.npy b/data/body_models/joints_regressor_cmr.npy
new file mode 100644
index 0000000000000000000000000000000000000000..06bcf3ff5f0f2797e8d090e4a5b1ea7c6c37db13
--- /dev/null
+++ b/data/body_models/joints_regressor_cmr.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a408b885040d714c94b41f64b2ec329d20dce673ae330d04a07a4b02dae7a13d
+size 661568
diff --git a/data/body_models/smpl/SMPL_FEMALE.pkl b/data/body_models/smpl/SMPL_FEMALE.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..92a201f4839bd95c1c1986437c7c6a02d7d1ae99
--- /dev/null
+++ b/data/body_models/smpl/SMPL_FEMALE.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a583c1b98e4afc19042641f1bae5cd8a1f712a6724886291a7627ec07acd408d
+size 39056454
diff --git a/data/body_models/smpl/SMPL_MALE.pkl b/data/body_models/smpl/SMPL_MALE.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..43dfecc57d9b7aa99cd2398df818ba252be7f605
--- /dev/null
+++ b/data/body_models/smpl/SMPL_MALE.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e8c0bbbbc635dcb166ed29c303fb4bef16ea5f623e5a89263495a9e403575bd
+size 39056404
diff --git a/data/body_models/smpl/SMPL_NEUTRAL.pkl b/data/body_models/smpl/SMPL_NEUTRAL.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..65ae47d34e5b26720c9ccdd2614044832f0e30f2
--- /dev/null
+++ b/data/body_models/smpl/SMPL_NEUTRAL.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4924f235e63f7c5d5b690acedf736419c2edb846a2d69fc0956169615fa75688
+size 247186228
diff --git a/data/body_models/smpl/index.html b/data/body_models/smpl/index.html
new file mode 100644
index 0000000000000000000000000000000000000000..60897cdaf2b7687e48b31e7025731020cfd13a5f
--- /dev/null
+++ b/data/body_models/smpl/index.html
@@ -0,0 +1,17 @@
+
+
+
+
+Directory listing for /body_models/smpl/
+
+
+Directory listing for /body_models/smpl/
+
+
+
+
+
diff --git a/data/body_models/smpl_mean_params.npz b/data/body_models/smpl_mean_params.npz
new file mode 100644
index 0000000000000000000000000000000000000000..c6f60a76976b877cbc08345b2977c6ddd83ced87
--- /dev/null
+++ b/data/body_models/smpl_mean_params.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6fd6dd687800da946d0a0492383f973b92ec20f166a0b829775882868c35fcdd
+size 1310
diff --git a/data/body_models/smplx/MANO_SMPLX_vertex_ids.pkl b/data/body_models/smplx/MANO_SMPLX_vertex_ids.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..dabec1377a0da4c511a519a00f51f1a3a23f33af
--- /dev/null
+++ b/data/body_models/smplx/MANO_SMPLX_vertex_ids.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e5abe70b6574de25470475091e8008314a5b90127eb48c3e63bfa0adf8c04dcf
+size 13535
diff --git a/data/body_models/smplx/SMPL-X__FLAME_vertex_ids.npy b/data/body_models/smplx/SMPL-X__FLAME_vertex_ids.npy
new file mode 100644
index 0000000000000000000000000000000000000000..c940d3aa6cb4cbbcc348fd518b15d8777dc350fd
--- /dev/null
+++ b/data/body_models/smplx/SMPL-X__FLAME_vertex_ids.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e70cdc3659aae699b9732e8dd4af49106310c69b90dc83d9f73e96dbf871e49
+size 40312
diff --git a/data/body_models/smplx/SMPLX_FEMALE.npz b/data/body_models/smplx/SMPLX_FEMALE.npz
new file mode 100644
index 0000000000000000000000000000000000000000..da0a200cd85eb10f73aa36d44f1d9c509a82dfcc
--- /dev/null
+++ b/data/body_models/smplx/SMPLX_FEMALE.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b2a3686c9d6d218ff6822fba411c607a3c8125a70af340f384ce68bebecabe0e
+size 108794146
diff --git a/data/body_models/smplx/SMPLX_FEMALE.pkl b/data/body_models/smplx/SMPLX_FEMALE.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..3b3c8f90629a55b1af53896ab37d9e6863f77d3d
--- /dev/null
+++ b/data/body_models/smplx/SMPLX_FEMALE.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f3ac7af258fd217ab480b839c011545e5826cfa333ab34b3c98244ee3039bddd
+size 544434140
diff --git a/data/body_models/smplx/SMPLX_MALE.npz b/data/body_models/smplx/SMPLX_MALE.npz
new file mode 100644
index 0000000000000000000000000000000000000000..41fdef3ff2784eb06bb479ebf5fb6887aafbc183
--- /dev/null
+++ b/data/body_models/smplx/SMPLX_MALE.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ab318e3f37d2bfaae26abf4e6fab445c2a610e1d63714794d60379cc263bc2a5
+size 108753445
diff --git a/data/body_models/smplx/SMPLX_MALE.pkl b/data/body_models/smplx/SMPLX_MALE.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..450a5c0a51fb0b382cd746efae420a7131a349cc
--- /dev/null
+++ b/data/body_models/smplx/SMPLX_MALE.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:af7ebc82e44cf098598685474c0592049ddfaca8e850feb0c2b88343f9aacee3
+size 544477159
diff --git a/data/body_models/smplx/SMPLX_NEUTRAL.npz b/data/body_models/smplx/SMPLX_NEUTRAL.npz
new file mode 100644
index 0000000000000000000000000000000000000000..6f42b326bd60123bd813c0fa2df7f4660862a920
--- /dev/null
+++ b/data/body_models/smplx/SMPLX_NEUTRAL.npz
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:376021446ddc86e99acacd795182bbef903e61d33b76b9d8b359c2b0865bd992
+size 108752058
diff --git a/data/body_models/smplx/SMPLX_NEUTRAL.pkl b/data/body_models/smplx/SMPLX_NEUTRAL.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..c2ef9ea8a36f2bf51256325bc6d24c181975483c
--- /dev/null
+++ b/data/body_models/smplx/SMPLX_NEUTRAL.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:381c808965deb4f5e845f8c3eddb0cd69930cc72e5774ce4f34c4ce3cf058361
+size 544173380
diff --git a/data/body_models/smplx/SMPLX_to_J14.npy b/data/body_models/smplx/SMPLX_to_J14.npy
new file mode 100644
index 0000000000000000000000000000000000000000..d336545c180ad9c89421cf9eae65aca2faf631d1
--- /dev/null
+++ b/data/body_models/smplx/SMPLX_to_J14.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be01f37aa99e794ace8f52abe7b31df302fe54c68e75062ea0431a6c2f5e084f
+size 1173328
diff --git a/data/body_models/smplx/SMPLX_to_J14.pkl b/data/body_models/smplx/SMPLX_to_J14.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..db8aa5c74b860a2b9555383d5ca2a09523851fe4
--- /dev/null
+++ b/data/body_models/smplx/SMPLX_to_J14.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5df844ddea85b0a400a2e8dbe63d09d19f2b1b7ec0e0e952daeae08f83d82d61
+size 4692193
diff --git a/data/body_models/smplx/smplx_kid_template.npy b/data/body_models/smplx/smplx_kid_template.npy
new file mode 100644
index 0000000000000000000000000000000000000000..8ce7bc403545dfb29f361787cb7bca1df8316d6e
--- /dev/null
+++ b/data/body_models/smplx/smplx_kid_template.npy
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bdce4f5886b9ddcb6da3ee0f70ae636b1aa1292f2b379c4c3149fce8abc0a604
+size 251528
diff --git a/data/body_models/smplx2smpl.pkl b/data/body_models/smplx2smpl.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..0f25e10571181989524020c803280607b7ee9a85
--- /dev/null
+++ b/data/body_models/smplx2smpl.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1d912d121ad98132e4492d8e7a0f1a8cf4412811e14a7ef8cb337bb48eef99e
+size 578019251
diff --git a/datasets/AGORA_MM.py b/datasets/AGORA_MM.py
new file mode 100644
index 0000000000000000000000000000000000000000..45f97d3d3bde58c29f684540f87b60c7845665cc
--- /dev/null
+++ b/datasets/AGORA_MM.py
@@ -0,0 +1,974 @@
+import os
+import os.path as osp
+from glob import glob
+import numpy as np
+from config.config import cfg
+import copy
+import json
+import pickle
+import cv2
+import torch
+from pycocotools.coco import COCO
+from util.human_models import smpl_x
+from util.preprocessing import load_img, sanitize_bbox, process_bbox, load_ply, load_obj
+from util.transforms import rigid_align, rigid_align_batch
+import tqdm
+import random
+from util.formatting import DefaultFormatBundle
+from detrsmpl.data.datasets.pipelines.transforms import Normalize
+import time
+from util.preprocessing import (
+    load_img, process_bbox, augmentation_instance_sample,
+    process_human_model_output_batch_simplify, process_db_coord_batch_no_valid)
+# from util.human_models import smpl_x
+from .humandata import HumanDataset
+import csv
+KPS2D_KEYS = [
+ 'keypoints2d_ori', 'keypoints2d_smplx', 'keypoints2d_smpl',
+ 'keypoints2d_original','keypoints2d_gta'
+]
+KPS3D_KEYS = [
+ 'keypoints3d_cam', 'keypoints3d', 'keypoints3d_smplx', 'keypoints3d_smpl',
+ 'keypoints3d_original', 'keypoints3d_gta'
+]
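+
+# AGORA_MM loads the preprocessed multi-person AGORA annotations (HumanData-style npz files)
+# through the shared HumanDataset pipeline, with optional on-disk caching of the parsed datalist.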
+class AGORA_MM(HumanDataset):
+ def __init__(self, transform, data_split):
+ super(AGORA_MM, self).__init__(transform, data_split)
+ self.img_shape = [2160,3840]
+ pre_prc_file_train = 'spec_train_smpl.npz'
+ pre_prc_file_test = 'spec_test_smpl.npz'
+ self.save_idx = 0
+ if self.data_split == 'train':
+ filename = getattr(cfg, 'filename', pre_prc_file_train)
+ else:
+ self.test_set = 'val'
+
+ self.img_dir = './data/datasets/agora'
+
+
+ if data_split == 'train':
+ if self.img_shape == [2160,3840]:
+ self.annot_path = 'data/preprocessed_npz/multihuman_data/agora_train_3840_w_occ_multi_2010.npz'
+ self.annot_path_cache = 'data/preprocessed_npz/cache/agora_train_3840_w_occ_cache_2010.npz'
+ elif self.img_shape == [720,1280]:
+ self.annot_path = 'data/preprocessed_npz/multihuman_data/agora_train_1280_multi_1010.npz'
+ self.annot_path_cache = 'data/preprocessed_npz/cache/agora_train_cache_1280_1010.npz'
+
+ elif data_split == 'test':
+ if self.img_shape == [2160,3840]:
+ self.annot_path = 'data/preprocessed_npz/multihuman_data/agora_validation_multi_3840_1010.npz'
+ self.annot_path_cache = 'data/preprocessed_npz/cache/agora_validation_cache_3840_1010_occ_cache_balance.npz'
+ elif self.img_shape == [720,1280]:
+ self.annot_path = 'data/preprocessed_npz/multihuman_data/agora_validation_1280_1010_occ.npz'
+ self.annot_path_cache = 'data/preprocessed_npz/cache/agora_validation_cache_1280_1010_occ.npz'
+
+ self.use_cache = getattr(cfg, 'use_cache', False)
+ self.cam_param = {}
+
+ # load data or cache
+ if self.use_cache and osp.isfile(self.annot_path_cache):
+ print(f'[{self.__class__.__name__}] loading cache from {self.annot_path_cache}')
+ self.datalist = self.load_cache(self.annot_path_cache)
+ else:
+ if self.use_cache:
+ print(f'[{self.__class__.__name__}] Cache not found, generating cache...')
+ self.datalist = self.load_data(
+ train_sample_interval=getattr(cfg, f'{self.__class__.__name__}_train_sample_interval', 1))
+ if self.use_cache:
+ self.save_cache(self.annot_path_cache, self.datalist)
+
+
+ def load_data(self, train_sample_interval=1):
+
+ content = np.load(self.annot_path, allow_pickle=True)
+
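+        # frame_range maps each image to the [start, end) slice of its per-person annotation rows;
+        # if it is absent, fall back to treating every annotation as its own single-frame range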
+ try:
+ frame_range = content['frame_range']
+ except KeyError:
+ frame_range = \
+ np.array([[i, i + 1] for i in range(self.num_data)])
+
+ num_examples = len(frame_range)
+
+ if 'meta' in content:
+ meta = content['meta'].item()
+ print('meta keys:', meta.keys())
+ else:
+ meta = None
+ print(
+ 'No meta info provided! Please give height and width manually')
+
+ print(
+ f'Start loading humandata {self.annot_path} into memory...\nDataset includes: {content.files}'
+ )
+ tic = time.time()
+ image_path = content['image_path']
+
+ if meta is not None and 'height' in meta:
+ height = np.array(meta['height'])
+ width = np.array(meta['width'])
+ image_shape = np.stack([height, width], axis=-1)
+ else:
+ image_shape = None
+
+ if meta is not None and 'gender' in meta and len(meta['gender']) != 0:
+ gender = meta['gender']
+ else:
+ gender = None
+
+ if meta is not None and 'is_kid' in meta and len(meta['is_kid']) != 0:
+ is_kid = meta['is_kid']
+ else:
+ is_kid = None
+
+ bbox_xywh = content['bbox_xywh']
+
+ if 'smplx' in content:
+ smplx = content['smplx'].item()
+ as_smplx = 'smplx'
+ elif 'smpl' in content:
+ smplx = content['smpl'].item()
+ as_smplx = 'smpl'
+ elif 'smplh' in content:
+ smplx = content['smplh'].item()
+ as_smplx = 'smplh'
+ # TODO: temp solution, should be more general. But SHAPY is very special
+ elif self.__class__.__name__ == 'SHAPY':
+ smplx = {}
+ else:
+            raise KeyError('No SMPL or SMPL-X parameters available, please check keys:\n'
+ f'{content.files}')
+
+ print('Smplx param', smplx.keys())
+
+ if 'lhand_bbox_xywh' in content and 'rhand_bbox_xywh' in content:
+ lhand_bbox_xywh = content['lhand_bbox_xywh']
+ rhand_bbox_xywh = content['rhand_bbox_xywh']
+ else:
+ lhand_bbox_xywh = np.zeros_like(bbox_xywh)
+ rhand_bbox_xywh = np.zeros_like(bbox_xywh)
+
+ if 'face_bbox_xywh' in content:
+ face_bbox_xywh = content['face_bbox_xywh']
+ else:
+ face_bbox_xywh = np.zeros_like(bbox_xywh)
+
+ decompressed = False
+ if content['__keypoints_compressed__']:
+ decompressed_kps = self.decompress_keypoints(content)
+ decompressed = True
+
+ keypoints3d = None
+ valid_kps3d = False
+ keypoints3d_mask = None
+ valid_kps3d_mask = False
+
+
+ # processing keypoints
+ for kps3d_key in KPS3D_KEYS:
+ if kps3d_key in content:
+ keypoints3d = decompressed_kps[kps3d_key][:, self.SMPLX_137_MAPPING, :] if decompressed \
+ else content[kps3d_key][:, self.SMPLX_137_MAPPING, :]
+ valid_kps3d = True
+ if keypoints3d.shape[-1] == 4:
+ valid_kps3d_mask = True
+ break
+ if self.keypoints2d is not None:
+ keypoints2d = decompressed_kps[self.keypoints2d][:, self.SMPLX_137_MAPPING, :] if decompressed \
+ else content[self.keypoints2d][:, self.SMPLX_137_MAPPING, :]
+
+
+ else:
+ for kps2d_key in KPS2D_KEYS:
+ if kps2d_key in content:
+ keypoints2d = decompressed_kps[kps2d_key][:, self.SMPLX_137_MAPPING, :] if decompressed \
+ else content[kps2d_key][:, self.SMPLX_137_MAPPING, :]
+
+ if keypoints2d.shape[-1] == 3:
+ valid_kps3d_mask = True
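+        # per-person occlusion values from the AGORA meta info (if available); used later to drop
+        # heavily occluded instances from supervision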
+ occlusion = content['meta'][()]['occ'] if 'occ' in content['meta'][()] and len(content['meta'][()]['occ'])>0 else None
+
+ print('Done. Time: {:.2f}s'.format(time.time() - tic))
+
+ datalist = []
+ # num_examples
+
+ # processing each image, filter according to bbox valid
+ for i in tqdm.tqdm(range(int(num_examples))):
+ if self.data_split == 'train' and i % train_sample_interval != 0:
+ continue
+ frame_start, frame_end = frame_range[i]
+ img_path = osp.join(self.img_dir, image_path[frame_start])
+ # im_shape = cv2.imread(img_path).shape[:2]
+ img_shape = image_shape[
+ frame_start] if image_shape is not None else self.img_shape
+
+
+ bbox_list = bbox_xywh[frame_start:frame_end, :4]
+
+ valid_idx = []
+ body_bbox_list = []
+
+ if hasattr(cfg, 'bbox_ratio'):
+                bbox_ratio = cfg.bbox_ratio * 0.833 # preprocessed body bboxes already include 1.2x padding; 0.833 ~ 1/1.2 compensates for it
+ else:
+ bbox_ratio = 1.25
+
+ for bbox_i, bbox in enumerate(bbox_list):
+
+ bbox = process_bbox(bbox,
+ img_width=img_shape[1],
+ img_height=img_shape[0],
+ ratio=bbox_ratio)
+ if bbox is None:
+ continue
+ else:
+ valid_idx.append(frame_start + bbox_i)
+ bbox[2:] += bbox[:2]
+ body_bbox_list.append(bbox)
+ if len(valid_idx) == 0:
+ continue
+ valid_num = len(valid_idx)
+ # hand/face bbox
+ lhand_bbox_list = []
+ rhand_bbox_list = []
+ face_bbox_list = []
+
+ for bbox_i in valid_idx:
+ lhand_bbox = lhand_bbox_xywh[bbox_i]
+
+ rhand_bbox = rhand_bbox_xywh[bbox_i]
+ face_bbox = face_bbox_xywh[bbox_i]
+ if lhand_bbox[-1] > 0: # conf > 0
+ lhand_bbox = lhand_bbox[:4]
+ if hasattr(cfg, 'bbox_ratio'):
+ lhand_bbox = process_bbox(lhand_bbox,
+ img_width=img_shape[1],
+ img_height=img_shape[0],
+ ratio=bbox_ratio)
+ if lhand_bbox is not None:
+ lhand_bbox[2:] += lhand_bbox[:2] # xywh -> xyxy
+ else:
+ lhand_bbox = None
+ if rhand_bbox[-1] > 0:
+ rhand_bbox = rhand_bbox[:4]
+ if hasattr(cfg, 'bbox_ratio'):
+ rhand_bbox = process_bbox(rhand_bbox,
+ img_width=img_shape[1],
+ img_height=img_shape[0],
+ ratio=bbox_ratio)
+ if rhand_bbox is not None:
+ rhand_bbox[2:] += rhand_bbox[:2] # xywh -> xyxy
+ else:
+ rhand_bbox = None
+ if face_bbox[-1] > 0:
+ face_bbox = face_bbox[:4]
+ if hasattr(cfg, 'bbox_ratio'):
+ face_bbox = process_bbox(face_bbox,
+ img_width=img_shape[1],
+ img_height=img_shape[0],
+ ratio=bbox_ratio)
+ if face_bbox is not None:
+ face_bbox[2:] += face_bbox[:2] # xywh -> xyxy
+ else:
+ face_bbox = None
+ lhand_bbox_list.append(lhand_bbox)
+ rhand_bbox_list.append(rhand_bbox)
+ face_bbox_list.append(face_bbox)
+
+ # lhand_bbox = np.stack(lhand_bbox_list,axis=0)
+ # rhand_bbox = np.stack(rhand_bbox_list,axis=0)
+ # face_bbox = np.stack(face_bbox_list,axis=0)
+ joint_img = keypoints2d[valid_idx]
+
+ # num_joints = joint_cam.shape[0]
+ # joint_valid = np.ones((num_joints, 1))
+ if valid_kps3d:
+ joint_cam = keypoints3d[valid_idx]
+ else:
+ joint_cam = None
+
+ if 'leye_pose_0' in smplx.keys():
+ smplx.pop('leye_pose_0')
+ if 'leye_pose_1' in smplx.keys():
+ smplx.pop('leye_pose_1')
+ if 'leye_pose' in smplx.keys():
+ smplx.pop('leye_pose')
+ if 'reye_pose_0' in smplx.keys():
+ smplx.pop('reye_pose_0')
+ if 'reye_pose_1' in smplx.keys():
+ smplx.pop('reye_pose_1')
+ if 'reye_pose' in smplx.keys():
+ smplx.pop('reye_pose')
+
+ occlusion_frame = occlusion[valid_idx] \
+ if occlusion is not None else np.array([1]*(valid_num))
+
+ smplx_param = {k: v[valid_idx] for k, v in smplx.items()}
+ gender_ = gender[valid_idx] \
+ if gender is not None else np.array(['neutral']*(valid_num))
+
+ is_kid_ = is_kid[valid_idx] \
+ if is_kid is not None else np.array([1]*(valid_num))
+ lhand_bbox_valid = lhand_bbox_xywh[valid_idx,4]
+ rhand_bbox_valid = rhand_bbox_xywh[valid_idx,4]
+ face_bbox_valid = face_bbox_xywh[valid_idx,4]
+
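+            # rename SMPL-X parameters from the smplx/HumanData naming (global_orient, betas,
+            # transl, ...) to the internal names used by the rest of the pipeline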
+ smplx_param['root_pose'] = smplx_param.pop('global_orient', None)
+ smplx_param['shape'] = smplx_param.pop('betas', None)
+ smplx_param['trans'] = smplx_param.pop('transl', np.zeros(3))
+ smplx_param['lhand_pose'] = smplx_param.pop('left_hand_pose', None)
+ smplx_param['rhand_pose'] = smplx_param.pop(
+ 'right_hand_pose', None)
+ smplx_param['expr'] = smplx_param.pop('expression', None)
+
+ # TODO do not fix betas, give up shape supervision
+ if 'betas_neutral' in smplx_param and self.data_split == 'train':
+ smplx_param['shape'] = smplx_param.pop('betas_neutral')
+ # smplx_param['shape'] = np.zeros(10, dtype=np.float32)
+
+            if smplx_param['lhand_pose'] is None or self.body_only:
+                smplx_param['lhand_valid'] = np.zeros(valid_num, dtype=np.bool_)
+            else:
+                smplx_param['lhand_valid'] = lhand_bbox_valid.astype(np.bool_)
+
+            if smplx_param['rhand_pose'] is None or self.body_only:
+                smplx_param['rhand_valid'] = np.zeros(valid_num, dtype=np.bool_)
+            else:
+                smplx_param['rhand_valid'] = rhand_bbox_valid.astype(np.bool_)
+
+            if smplx_param['expr'] is None or self.body_only:
+                smplx_param['face_valid'] = np.zeros(valid_num, dtype=np.bool_)
+            else:
+                smplx_param['face_valid'] = face_bbox_valid.astype(np.bool_)
+
+ if joint_cam is not None and np.any(np.isnan(joint_cam)):
+ continue
+
+
+ datalist.append({
+ 'img_path': img_path,
+ 'img_shape': img_shape,
+ 'bbox': body_bbox_list,
+ 'lhand_bbox': lhand_bbox_list,
+ 'rhand_bbox': rhand_bbox_list,
+ 'face_bbox': face_bbox_list,
+ 'joint_img': joint_img,
+ 'joint_cam': joint_cam,
+ 'smplx_param': smplx_param,
+ 'as_smplx': as_smplx,
+ 'gender': gender_,
+ 'occlusion': occlusion_frame,
+ 'is_kid': is_kid_,
+ })
+
+ # save memory
+ del content, image_path, bbox_xywh, lhand_bbox_xywh, rhand_bbox_xywh, face_bbox_xywh, keypoints3d, keypoints2d
+
+ if self.data_split == 'train':
+ print(f'[{self.__class__.__name__} train] original size:',
+ int(num_examples), '. Sample interval:',
+ train_sample_interval, '. Sampled size:', len(datalist))
+
+ if getattr(cfg, 'data_strategy',
+ None) == 'balance' and self.data_split == 'train':
+ print(
+ f'[{self.__class__.__name__}] Using [balance] strategy with datalist shuffled...'
+ )
+ random.shuffle(datalist)
+
+ return datalist
+
+ def __getitem__(self, idx):
+ try:
+ data = copy.deepcopy(self.datalist[idx])
+ except Exception as e:
+ print(f'[{self.__class__.__name__}] Error loading data {idx}')
+ print(e)
+ exit(0)
+
+ img_path, img_shape, bbox = \
+ data['img_path'], data['img_shape'], data['bbox']
+ as_smplx = data['as_smplx']
+ gender = data['gender'].copy()
+ for gender_str, gender_num in {
+ 'neutral': -1, 'male': 0, 'female': 1}.items():
+ gender[gender==gender_str]=gender_num
+ gender = gender.astype(int)
+
+ img_whole_bbox = np.array([0, 0, img_shape[1], img_shape[0]])
+ img = load_img(img_path, order='BGR')
+
+ num_person = len(data['bbox'])
+ data_name = self.__class__.__name__
+ img, img2bb_trans, bb2img_trans, rot, do_flip = \
+ augmentation_instance_sample(img, img_whole_bbox, self.data_split,data,data_name)
+ cropped_img_shape=img.shape[:2]
+
+ num_person = len(data['bbox'])
+ if self.data_split == 'train':
+ joint_cam = data['joint_cam'] # num, 137,4
+ if joint_cam is not None:
+ dummy_cord = False
+ joint_cam[:,:,:3] = \
+ joint_cam[:,:,:3] - joint_cam[:, self.joint_set['root_joint_idx'], None, :3] # root-relative
+ else:
+ # dummy cord as joint_cam
+ dummy_cord = True
+ joint_cam = np.zeros(
+ (num_person, self.joint_set['joint_num'], 4),
+ dtype=np.float32)
+
+ joint_img = data['joint_img']
+ # do rotation on keypoints
+ joint_img_aug, joint_cam_wo_ra, joint_cam_ra, joint_trunc = \
+ process_db_coord_batch_no_valid(
+ joint_img, joint_cam, do_flip, img_shape,
+ self.joint_set['flip_pairs'], img2bb_trans, rot,
+ self.joint_set['joints_name'], smpl_x.joints_name,
+ cropped_img_shape)
+ joint_img_aug[:,:,2:] = joint_img_aug[:,:,2:] * joint_trunc
+
+ # smplx coordinates and parameters
+ smplx_param = data['smplx_param']
+ smplx_pose, smplx_shape, smplx_expr, smplx_pose_valid, \
+ smplx_joint_valid, smplx_expr_valid, smplx_shape_valid = \
+ process_human_model_output_batch_simplify(
+ smplx_param, do_flip, rot, as_smplx)
+ # if cam not provided, we take joint_img as smplx joint 2d,
+ # which is commonly the case for our processed humandata
+ # change smplx_shape if use_betas_neutral
+ # processing follows that in process_human_model_output
+
+ if self.use_betas_neutral:
+ smplx_shape = smplx_param['betas_neutral'].reshape(
+ num_person, -1)
+ smplx_shape[(np.abs(smplx_shape) > 3).any(axis=1)] = 0.
+ smplx_shape = smplx_shape.reshape(num_person, -1)
+ # SMPLX joint coordinate validity
+ # for name in ('L_Big_toe', 'L_Small_toe', 'L_Heel', 'R_Big_toe', 'R_Small_toe', 'R_Heel'):
+ # smplx_joint_valid[smpl_x.joints_name.index(name)] = 0
+ smplx_joint_valid = smplx_joint_valid[:, :, None]
+
+ lhand_bbox_center_list = []
+ lhand_bbox_valid_list = []
+ lhand_bbox_size_list = []
+ lhand_bbox_list = []
+ face_bbox_center_list = []
+ face_bbox_size_list = []
+ face_bbox_valid_list = []
+ face_bbox_list = []
+ rhand_bbox_center_list = []
+ rhand_bbox_valid_list = []
+ rhand_bbox_size_list = []
+ rhand_bbox_list = []
+ body_bbox_center_list = []
+ body_bbox_size_list = []
+ body_bbox_valid_list = []
+ body_bbox_list = []
+
+ for i in range(num_person):
+ body_bbox, body_bbox_valid = self.process_hand_face_bbox(
+ data['bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+
+ lhand_bbox, lhand_bbox_valid = self.process_hand_face_bbox(
+ data['lhand_bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+ lhand_bbox_valid *= smplx_param['lhand_valid'][i]
+
+ rhand_bbox, rhand_bbox_valid = self.process_hand_face_bbox(
+ data['rhand_bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+ rhand_bbox_valid *= smplx_param['rhand_valid'][i]
+
+ face_bbox, face_bbox_valid = self.process_hand_face_bbox(
+ data['face_bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+ face_bbox_valid *= smplx_param['face_valid'][i]
+
+ if do_flip:
+ lhand_bbox, rhand_bbox = rhand_bbox, lhand_bbox
+ lhand_bbox_valid, rhand_bbox_valid = rhand_bbox_valid, lhand_bbox_valid
+
+ body_bbox_list.append(body_bbox)
+ lhand_bbox_list.append(lhand_bbox)
+ rhand_bbox_list.append(rhand_bbox)
+ face_bbox_list.append(face_bbox)
+
+ lhand_bbox_center = (lhand_bbox[0] + lhand_bbox[1]) / 2.
+ rhand_bbox_center = (rhand_bbox[0] + rhand_bbox[1]) / 2.
+ face_bbox_center = (face_bbox[0] + face_bbox[1]) / 2.
+ body_bbox_center = (body_bbox[0] + body_bbox[1]) / 2.
+ lhand_bbox_size = lhand_bbox[1] - lhand_bbox[0]
+ rhand_bbox_size = rhand_bbox[1] - rhand_bbox[0]
+
+ face_bbox_size = face_bbox[1] - face_bbox[0]
+ body_bbox_size = body_bbox[1] - body_bbox[0]
+ lhand_bbox_center_list.append(lhand_bbox_center)
+ lhand_bbox_valid_list.append(lhand_bbox_valid)
+ lhand_bbox_size_list.append(lhand_bbox_size)
+ face_bbox_center_list.append(face_bbox_center)
+ face_bbox_size_list.append(face_bbox_size)
+ face_bbox_valid_list.append(face_bbox_valid)
+ rhand_bbox_center_list.append(rhand_bbox_center)
+ rhand_bbox_valid_list.append(rhand_bbox_valid)
+ rhand_bbox_size_list.append(rhand_bbox_size)
+ body_bbox_center_list.append(body_bbox_center)
+ body_bbox_size_list.append(body_bbox_size)
+ body_bbox_valid_list.append(body_bbox_valid)
+
+
+ body_bbox = np.stack(body_bbox_list, axis=0)
+ lhand_bbox = np.stack(lhand_bbox_list, axis=0)
+ rhand_bbox = np.stack(rhand_bbox_list, axis=0)
+ face_bbox = np.stack(face_bbox_list, axis=0)
+ lhand_bbox_center = np.stack(lhand_bbox_center_list, axis=0)
+ lhand_bbox_valid = np.stack(lhand_bbox_valid_list, axis=0)
+ lhand_bbox_size = np.stack(lhand_bbox_size_list, axis=0)
+ face_bbox_center = np.stack(face_bbox_center_list, axis=0)
+ face_bbox_size = np.stack(face_bbox_size_list, axis=0)
+ face_bbox_valid = np.stack(face_bbox_valid_list, axis=0)
+ body_bbox_center = np.stack(body_bbox_center_list, axis=0)
+ body_bbox_size = np.stack(body_bbox_size_list, axis=0)
+ body_bbox_valid = np.stack(body_bbox_valid_list, axis=0)
+ rhand_bbox_center = np.stack(rhand_bbox_center_list, axis=0)
+ rhand_bbox_valid = np.stack(rhand_bbox_valid_list, axis=0)
+ rhand_bbox_size = np.stack(rhand_bbox_size_list, axis=0)
+
+
+ if 'occlusion' in data:
+ occlusion = data['occlusion']
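+                # instances with an occlusion value >= 97 (the value appears to be a percentage)
+                # are treated as fully occluded and excluded from every supervision term below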
+ occ_mask = occlusion<97
+
+ joint_img_aug[:,:,2] = joint_img_aug[:,:,2]*occ_mask[:,None]
+ joint_cam_wo_ra[:,:,3] = joint_cam_wo_ra[:,:,3]*occ_mask[:,None]
+ joint_trunc = joint_trunc*occ_mask[:,None,None]
+ smplx_pose_valid = smplx_pose_valid*occ_mask[:,None]
+ smplx_joint_valid = smplx_joint_valid*occ_mask[:,None,None]
+ smplx_expr_valid = smplx_expr_valid*occ_mask
+ smplx_shape_valid = smplx_shape_valid*occ_mask
+ rhand_bbox_valid = rhand_bbox_valid*occ_mask
+ lhand_bbox_valid = lhand_bbox_valid*occ_mask
+ face_bbox_valid = face_bbox_valid*occ_mask
+
+
+ if 'is_kid' in data:
+ is_kid = data['is_kid'].copy()
+ smplx_shape_valid = smplx_shape_valid * (is_kid==0)
+
+
+ inputs = {'img': img}
+
+ joint_img_aug[:,:,2] = joint_img_aug[:,:,2] * body_bbox_valid[:,None]
+
+ is_3D = float(False) if dummy_cord else float(True)
+
+ targets = {
+ # keypoints2d, [0,img_w],[0,img_h] -> [0,1] -> [0,output_hm_shape]
+ 'joint_img': joint_img_aug[body_bbox_valid>0],
+ # joint_cam, kp3d wo ra # raw kps3d probably without ra
+ 'joint_cam': joint_cam_wo_ra[body_bbox_valid>0],
+ # kps3d with body, face, hand ra
+ 'smplx_joint_cam': joint_cam_ra[body_bbox_valid>0],
+ 'smplx_pose': smplx_pose[body_bbox_valid>0],
+ 'smplx_shape': smplx_shape[body_bbox_valid>0],
+ 'smplx_expr': smplx_expr[body_bbox_valid>0],
+ 'lhand_bbox_center': lhand_bbox_center[body_bbox_valid>0],
+ 'lhand_bbox_size': lhand_bbox_size[body_bbox_valid>0],
+ 'rhand_bbox_center': rhand_bbox_center[body_bbox_valid>0],
+ 'rhand_bbox_size': rhand_bbox_size[body_bbox_valid>0],
+ 'face_bbox_center': face_bbox_center[body_bbox_valid>0],
+ 'face_bbox_size': face_bbox_size[body_bbox_valid>0],
+ 'body_bbox_center': body_bbox_center[body_bbox_valid>0],
+ 'body_bbox_size': body_bbox_size[body_bbox_valid>0],
+ 'body_bbox': body_bbox.reshape(-1,4)[body_bbox_valid>0],
+ 'lhand_bbox': lhand_bbox.reshape(-1,4)[body_bbox_valid>0],
+ 'rhand_bbox': rhand_bbox.reshape(-1,4)[body_bbox_valid>0],
+ 'face_bbox': face_bbox.reshape(-1,4)[body_bbox_valid>0],
+ 'gender': gender[body_bbox_valid>0]}
+
+ meta_info = {
+ 'joint_trunc': joint_trunc[body_bbox_valid>0],
+ 'smplx_pose_valid': smplx_pose_valid[body_bbox_valid>0],
+ 'smplx_shape_valid': smplx_shape_valid[body_bbox_valid>0],
+ 'smplx_expr_valid': smplx_expr_valid[body_bbox_valid>0],
+ 'is_3D': is_3D,
+ 'lhand_bbox_valid': lhand_bbox_valid[body_bbox_valid>0],
+ 'rhand_bbox_valid': rhand_bbox_valid[body_bbox_valid>0],
+ 'face_bbox_valid': face_bbox_valid[body_bbox_valid>0],
+ 'body_bbox_valid': body_bbox_valid[body_bbox_valid>0],
+ 'img_shape': np.array(img.shape[:2]),
+ 'ori_shape':data['img_shape'],
+ 'idx': idx
+
+ }
+ result = {**inputs, **targets, **meta_info}
+
+ result = self.normalize(result)
+ result = self.format(result)
+ return result
+
+
+
+ if self.data_split == 'test':
+ self.cam_param = {}
+ joint_cam = data['joint_cam']
+
+ if joint_cam is not None:
+ dummy_cord = False
+ joint_cam[:,:,:3] = joint_cam[:,:,:3] - joint_cam[
+ :, self.joint_set['root_joint_idx'], None, :3] # root-relative
+ else:
+ # dummy cord as joint_cam
+ dummy_cord = True
+ joint_cam = np.zeros(
+ (num_person, self.joint_set['joint_num'], 3),
+ dtype=np.float32)
+
+ joint_img = data['joint_img']
+
+
+ joint_img_aug, joint_cam_wo_ra, joint_cam_ra, joint_trunc = \
+ process_db_coord_batch_no_valid(
+ joint_img, joint_cam, do_flip, img_shape,
+ self.joint_set['flip_pairs'], img2bb_trans, rot,
+ self.joint_set['joints_name'], smpl_x.joints_name,
+ cropped_img_shape)
+
+
+
+ # smplx coordinates and parameters
+ smplx_param = data['smplx_param']
+ # smplx_cam_trans = np.array(
+ # smplx_param['trans']) if 'trans' in smplx_param else None
+            # TODO: remove this, separate smpl and smplx
+ smplx_pose, smplx_shape, smplx_expr, smplx_pose_valid, \
+ smplx_joint_valid, smplx_expr_valid, smplx_shape_valid = \
+ process_human_model_output_batch_simplify(
+ smplx_param, do_flip, rot, as_smplx)
+
+ # if cam not provided, we take joint_img as smplx joint 2d,
+ # which is commonly the case for our processed humandata
+ if self.use_betas_neutral:
+ smplx_shape = smplx_param['betas_neutral'].reshape(
+ num_person, -1)
+ smplx_shape[(np.abs(smplx_shape) > 3).any(axis=1)] = 0.
+ smplx_shape = smplx_shape.reshape(num_person, -1)
+
+ smplx_joint_valid = smplx_joint_valid[:, :, None]
+
+ lhand_bbox_center_list = []
+ lhand_bbox_valid_list = []
+ lhand_bbox_size_list = []
+ lhand_bbox_list = []
+ face_bbox_center_list = []
+ face_bbox_size_list = []
+ face_bbox_valid_list = []
+ face_bbox_list = []
+ rhand_bbox_center_list = []
+ rhand_bbox_valid_list = []
+ rhand_bbox_size_list = []
+ rhand_bbox_list = []
+ body_bbox_center_list = []
+ body_bbox_size_list = []
+ body_bbox_valid_list = []
+ body_bbox_list = []
+
+ for i in range(num_person):
+ lhand_bbox, lhand_bbox_valid = self.process_hand_face_bbox(
+ data['lhand_bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+ rhand_bbox, rhand_bbox_valid = self.process_hand_face_bbox(
+ data['rhand_bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+ face_bbox, face_bbox_valid = self.process_hand_face_bbox(
+ data['face_bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+
+ body_bbox, body_bbox_valid = self.process_hand_face_bbox(
+ data['bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+
+ if do_flip:
+ lhand_bbox, rhand_bbox = rhand_bbox, lhand_bbox
+ lhand_bbox_valid, rhand_bbox_valid = rhand_bbox_valid, lhand_bbox_valid
+
+ body_bbox_list.append(body_bbox)
+ lhand_bbox_list.append(lhand_bbox)
+ rhand_bbox_list.append(rhand_bbox)
+ face_bbox_list.append(face_bbox)
+
+ lhand_bbox_center = (lhand_bbox[0] + lhand_bbox[1]) / 2.
+ rhand_bbox_center = (rhand_bbox[0] + rhand_bbox[1]) / 2.
+ face_bbox_center = (face_bbox[0] + face_bbox[1]) / 2.
+ body_bbox_center = (body_bbox[0] + body_bbox[1]) / 2.
+ lhand_bbox_size = lhand_bbox[1] - lhand_bbox[0]
+ rhand_bbox_size = rhand_bbox[1] - rhand_bbox[0]
+
+ face_bbox_size = face_bbox[1] - face_bbox[0]
+ body_bbox_size = body_bbox[1] - body_bbox[0]
+ lhand_bbox_center_list.append(lhand_bbox_center)
+ lhand_bbox_valid_list.append(lhand_bbox_valid)
+ lhand_bbox_size_list.append(lhand_bbox_size)
+ face_bbox_center_list.append(face_bbox_center)
+ face_bbox_size_list.append(face_bbox_size)
+ face_bbox_valid_list.append(face_bbox_valid)
+ rhand_bbox_center_list.append(rhand_bbox_center)
+ rhand_bbox_valid_list.append(rhand_bbox_valid)
+ rhand_bbox_size_list.append(rhand_bbox_size)
+ body_bbox_center_list.append(body_bbox_center)
+ body_bbox_size_list.append(body_bbox_size)
+ body_bbox_valid_list.append(body_bbox_valid)
+
+ body_bbox = np.stack(body_bbox_list, axis=0)
+ lhand_bbox = np.stack(lhand_bbox_list, axis=0)
+ rhand_bbox = np.stack(rhand_bbox_list, axis=0)
+ face_bbox = np.stack(face_bbox_list, axis=0)
+ lhand_bbox_center = np.stack(lhand_bbox_center_list, axis=0)
+ lhand_bbox_valid = np.stack(lhand_bbox_valid_list, axis=0)
+ lhand_bbox_size = np.stack(lhand_bbox_size_list, axis=0)
+ face_bbox_center = np.stack(face_bbox_center_list, axis=0)
+ face_bbox_size = np.stack(face_bbox_size_list, axis=0)
+ face_bbox_valid = np.stack(face_bbox_valid_list, axis=0)
+ body_bbox_center = np.stack(body_bbox_center_list, axis=0)
+ body_bbox_size = np.stack(body_bbox_size_list, axis=0)
+ body_bbox_valid = np.stack(body_bbox_valid_list, axis=0)
+ rhand_bbox_center = np.stack(rhand_bbox_center_list, axis=0)
+ rhand_bbox_valid = np.stack(rhand_bbox_valid_list, axis=0)
+ rhand_bbox_size = np.stack(rhand_bbox_size_list, axis=0)
+
+
+ inputs = {'img': img}
+
+ targets = {
+ # keypoints2d, [0,img_w],[0,img_h] -> [0,1] -> [0,output_hm_shape]
+ 'joint_img': joint_img_aug,
+ # projected smplx if valid cam_param, else same as keypoints2d
+ # joint_cam, kp3d wo ra # raw kps3d probably without ra
+ 'joint_cam': joint_cam_wo_ra,
+ 'ann_idx': idx,
+ # kps3d with body, face, hand ra
+ 'smplx_joint_cam': joint_cam_ra,
+ 'smplx_pose': smplx_pose,
+ 'smplx_shape': smplx_shape,
+ 'smplx_expr': smplx_expr,
+ 'lhand_bbox_center': lhand_bbox_center,
+ 'lhand_bbox_size': lhand_bbox_size,
+ 'rhand_bbox_center': rhand_bbox_center,
+ 'rhand_bbox_size': rhand_bbox_size,
+ 'face_bbox_center': face_bbox_center,
+ 'face_bbox_size': face_bbox_size,
+ 'body_bbox_center': body_bbox_center,
+ 'body_bbox_size': body_bbox_size,
+ 'body_bbox': body_bbox.reshape(-1,4),
+ 'lhand_bbox': lhand_bbox.reshape(-1,4),
+ 'rhand_bbox': rhand_bbox.reshape(-1,4),
+ 'face_bbox': face_bbox.reshape(-1,4),
+ 'gender': gender,
+ 'bb2img_trans': bb2img_trans,
+ }
+
+ if self.body_only:
+ meta_info = {
+ 'joint_trunc': joint_trunc,
+ 'smplx_pose_valid': smplx_pose_valid,
+ 'smplx_shape_valid': float(smplx_shape_valid),
+ 'smplx_expr_valid': smplx_expr_valid,
+ 'is_3D': float(False) if dummy_cord else float(True),
+ 'lhand_bbox_valid': lhand_bbox_valid,
+ 'rhand_bbox_valid': rhand_bbox_valid,
+ 'face_bbox_valid': face_bbox_valid,
+ 'body_bbox_valid': body_bbox_valid,
+ 'img_shape': np.array(img.shape[:2]),
+ 'ori_shape':data['img_shape'],
+ 'idx': idx
+ }
+ else:
+ meta_info = {
+ 'joint_trunc': joint_trunc,
+ 'smplx_pose_valid': smplx_pose_valid,
+ 'smplx_shape_valid': smplx_shape_valid,
+ 'smplx_expr_valid': smplx_expr_valid,
+ 'is_3D': float(False) if dummy_cord else float(True),
+ 'lhand_bbox_valid': lhand_bbox_valid,
+ 'rhand_bbox_valid': rhand_bbox_valid,
+ 'face_bbox_valid': face_bbox_valid,
+ 'body_bbox_valid': body_bbox_valid,
+ 'img_shape': np.array(img.shape[:2]),
+ 'ori_shape':data['img_shape'],
+ 'idx': idx
+ }
+
+ result = {**inputs, **targets, **meta_info}
+ result = self.normalize(result)
+ result = self.format(result)
+ return result
+
+ def evaluate(self, outs, cur_sample_idx):
+ annots = self.datalist
+ sample_num = len(outs)
+ eval_result = {
+ 'pa_mpvpe_all': [],
+ 'pa_mpvpe_l_hand': [],
+ 'pa_mpvpe_r_hand': [],
+ 'pa_mpvpe_hand': [],
+ 'pa_mpvpe_face': [],
+ 'mpvpe_all': [],
+ 'mpvpe_l_hand': [],
+ 'mpvpe_r_hand': [],
+ 'mpvpe_hand': [],
+ 'mpvpe_face': []
+ }
+
+ vis = getattr(cfg, 'vis', False)
+ vis_save_dir = cfg.vis_dir
+
+ csv_file = f'{cfg.result_dir}/agora_smplx_error.csv'
+ file = open(csv_file, 'a', newline='')
+ for n in range(sample_num):
+ annot = annots[cur_sample_idx + n]
+ out = outs[n]
+ mesh_gt = out['smplx_mesh_cam_target']
+ mesh_out = out['smplx_mesh_cam']
+
+ # print('zzz',mesh_gt.shape,mesh_out.shape)
+ # from pytorch3d.io import save_obj
+ # for m_i,(mesh_gt_i,mesh_out_i) in enumerate(zip(mesh_gt,mesh_out)):
+ # save_obj('temp_gt_%d.obj'%m_i,verts=torch.Tensor(mesh_gt_i),faces=torch.tensor([]))
+ # save_obj('temp_pred_%d.obj'%m_i,verts=torch.Tensor(mesh_out_i),faces=torch.tensor([]))
+
+ ann_idx = out['gt_ann_idx']
+ img_path = []
+ for ann_id in ann_idx:
+ img_path.append(annots[ann_id]['img_path'])
+ eval_result['img_path'] = img_path
+ eval_result['ann_idx'] = ann_idx
+ # MPVPE from all vertices
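+            # translation-align the prediction so its regressed pelvis coincides with the GT pelvis
+            # before measuring the per-vertex error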
+ mesh_out_align = \
+ mesh_out - np.dot(
+ smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['pelvis'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['pelvis'], None, :]
+
+ eval_result['mpvpe_all'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_align - mesh_gt)**2, -1)).mean(-1) * 1000)
+ mesh_out_align = rigid_align_batch(mesh_out, mesh_gt)
+ eval_result['pa_mpvpe_all'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_align - mesh_gt)**2, -1)).mean(-1) * 1000)
+
+ # MPVPE from hand vertices
+ mesh_gt_lhand = mesh_gt[:, smpl_x.hand_vertex_idx['left_hand'], :]
+ mesh_out_lhand = mesh_out[:, smpl_x.hand_vertex_idx['left_hand'], :]
+ mesh_gt_rhand = mesh_gt[:, smpl_x.hand_vertex_idx['right_hand'], :]
+ mesh_out_rhand = mesh_out[:, smpl_x.hand_vertex_idx['right_hand'], :]
+ mesh_out_lhand_align = \
+ mesh_out_lhand - \
+ np.dot(smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['lwrist'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['lwrist'], None, :]
+
+ mesh_out_rhand_align = \
+ mesh_out_rhand - \
+ np.dot(smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['rwrist'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['rwrist'], None, :]
+
+ eval_result['mpvpe_l_hand'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean(-1) *
+ 1000)
+ eval_result['mpvpe_r_hand'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean(-1) *
+ 1000)
+ eval_result['mpvpe_hand'].extend(
+ (np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean(-1) *
+ 1000 +
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean(-1) *
+ 1000) / 2.)
+ mesh_out_lhand_align = rigid_align_batch(mesh_out_lhand, mesh_gt_lhand)
+ mesh_out_rhand_align = rigid_align_batch(mesh_out_rhand, mesh_gt_rhand)
+ eval_result['pa_mpvpe_l_hand'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean(-1) *
+ 1000)
+ eval_result['pa_mpvpe_r_hand'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean(-1) *
+ 1000)
+ eval_result['pa_mpvpe_hand'].extend(
+ (np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean(-1) *
+ 1000 +
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean(-1) *
+ 1000) / 2.)
+
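+            # NOTE: face-vertex MPVPE is not computed for AGORA, so the 'mpvpe_face' and
+            # 'pa_mpvpe_face' lists stay empty here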
+
+ save_error=True
+ if save_error:
+ writer = csv.writer(file)
+ new_line = [ann_idx[n],img_path[n], eval_result['mpvpe_all'][-1], eval_result['pa_mpvpe_all'][-1]]
+ writer.writerow(new_line)
+ self.save_idx += 1
+
+
+ return eval_result
+
+
+ def print_eval_result(self, eval_result):
+
+ print('AGORA test results are dumped at: ' +
+ osp.join(cfg.result_dir, 'predictions'))
+
+ if self.data_split == 'test' and self.test_set == 'test': # do not print. just submit the results to the official evaluation server
+ return
+
+ print('======AGORA-val======')
+ print('PA MPVPE (All): %.2f mm' % np.mean(eval_result['pa_mpvpe_all']))
+ print('PA MPVPE (L-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_l_hand']))
+ print('PA MPVPE (R-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_r_hand']))
+ print('PA MPVPE (Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_hand']))
+ print('PA MPVPE (Face): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_face']))
+ print()
+
+ print('MPVPE (All): %.2f mm' % np.mean(eval_result['mpvpe_all']))
+ print('MPVPE (L-Hands): %.2f mm' %
+ np.mean(eval_result['mpvpe_l_hand']))
+ print('MPVPE (R-Hands): %.2f mm' %
+ np.mean(eval_result['mpvpe_r_hand']))
+ print('MPVPE (Hands): %.2f mm' % np.mean(eval_result['mpvpe_hand']))
+ print('MPVPE (Face): %.2f mm' % np.mean(eval_result['mpvpe_face']))
+
+ out_file = osp.join(cfg.result_dir,'agora_val.txt')
+ if os.path.exists(out_file):
+ f = open(out_file, 'a+')
+ else:
+ f = open(out_file, 'w', encoding="utf-8")
+
+ f.write('\n')
+ f.write(f'{cfg.exp_name}\n')
+ f.write(f'AGORA-val dataset: \n')
+ f.write('PA MPVPE (All): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_all']))
+ f.write('PA MPVPE (L-Hands): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_l_hand']))
+ f.write('PA MPVPE (R-Hands): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_r_hand']))
+ f.write('PA MPVPE (Hands): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_hand']))
+ f.write('PA MPVPE (Face): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_face']))
+ f.write('MPVPE (All): %.2f mm\n' % np.mean(eval_result['mpvpe_all']))
+ f.write('MPVPE (L-Hands): %.2f mm\n' %
+ np.mean(eval_result['mpvpe_l_hand']))
+ f.write('MPVPE (R-Hands): %.2f mm\n' %
+ np.mean(eval_result['mpvpe_r_hand']))
+ f.write('MPVPE (Hands): %.2f mm\n' % np.mean(eval_result['mpvpe_hand']))
+ f.write('MPVPE (Face): %.2f mm\n' % np.mean(eval_result['mpvpe_face']))
+        f.close()
diff --git a/datasets/ARCTIC.py b/datasets/ARCTIC.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c96065d579fb642596ffa3f3f15686a6e4e0d46
--- /dev/null
+++ b/datasets/ARCTIC.py
@@ -0,0 +1,215 @@
+import os
+import os.path as osp
+from glob import glob
+import numpy as np
+from config.config import cfg
+
+import csv
+
+from util.human_models import smpl_x
+
+from util.transforms import rigid_align_batch
+
+from humandata import HumanDataset
+
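+# ARCTIC wrapper: reuses the shared HumanDataset loading/caching pipeline and adds SMPL-X mesh
+# evaluation (MPVPE / PA-MPVPE for body, hands and face) on the validation split.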
+class ARCTIC(HumanDataset):
+ def __init__(self, transform, data_split):
+ super(ARCTIC, self).__init__(transform, data_split)
+
+ self.img_dir = 'data/osx_data/ARCTIC'
+
+
+ if data_split == 'train':
+ self.annot_path = 'data/preprocessed_npz/multihuman_data/p1_train_multi.npz'
+ self.annot_path_cache = 'data/preprocessed_npz/cache/p1_train_cache_sample1000_080824.npz'
+ self.sample_interval = 1000
+ elif data_split == 'test':
+ self.annot_path = 'data/preprocessed_npz_old/multihuman_data/p1_val_multi.npz'
+ self.annot_path_cache = 'data/preprocessed_npz_old/cache/p1_val_cache_30.npz'
+ self.sample_interval = 30
+
+
+ self.use_cache = getattr(cfg, 'use_cache', False)
+        self.img_shape = None  # (1024, 1024) # (h, w)
+ self.cam_param = {}
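+        # NOTE: caching is force-enabled below, overriding the 'use_cache' value read from the
+        # config above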
+ self.use_cache=True
+ # load data
+ if self.use_cache and osp.isfile(self.annot_path_cache):
+ print(
+ f'[{self.__class__.__name__}] loading cache from {self.annot_path_cache}'
+ )
+ self.datalist = self.load_cache(self.annot_path_cache)
+ else:
+ if self.use_cache:
+ print(
+ f'[{self.__class__.__name__}] Cache not found, generating cache...'
+ )
+ self.datalist = self.load_data(train_sample_interval=getattr(
+ cfg, f'{self.__class__.__name__}_train_sample_interval', self.sample_interval))
+ if self.use_cache:
+ self.save_cache(self.annot_path_cache, self.datalist)
+
+
+ def evaluate(self, outs, cur_sample_idx):
+ annots = self.datalist
+ sample_num = len(outs)
+ eval_result = {
+ 'pa_mpvpe_all': [],
+ 'pa_mpvpe_l_hand': [],
+ 'pa_mpvpe_r_hand': [],
+ 'pa_mpvpe_hand': [],
+ 'pa_mpvpe_face': [],
+ 'mpvpe_all': [],
+ 'mpvpe_l_hand': [],
+ 'mpvpe_r_hand': [],
+ 'mpvpe_hand': [],
+ 'mpvpe_face': []
+ }
+
+ vis = getattr(cfg, 'vis', False)
+ vis_save_dir = cfg.vis_dir
+ csv_file = f'{cfg.result_dir}/arctic_smplx_error.csv'
+ file = open(csv_file, 'a', newline='')
+
+ for n in range(sample_num):
+ annot = annots[cur_sample_idx + n]
+ out = outs[n]
+ mesh_gt = out['smplx_mesh_cam_target']
+ mesh_out = out['smplx_mesh_cam']
+ ann_idx = out['gt_ann_idx']
+ img_path = []
+ for ann_id in ann_idx:
+ img_path.append(annots[ann_id]['img_path'])
+ eval_result['img_path'] = img_path
+ # MPVPE from all vertices
+ mesh_out_align = \
+ mesh_out - np.dot(
+ smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['pelvis'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['pelvis'], None, :]
+
+ eval_result['mpvpe_all'].append(
+ np.sqrt(np.sum(
+ (mesh_out_align - mesh_gt)**2, -1)).mean() * 1000)
+ mesh_out_align = rigid_align_batch(mesh_out, mesh_gt)
+ eval_result['pa_mpvpe_all'].append(
+ np.sqrt(np.sum(
+ (mesh_out_align - mesh_gt)**2, -1)).mean() * 1000)
+
+ # MPVPE from hand vertices
+ mesh_gt_lhand = mesh_gt[:, smpl_x.hand_vertex_idx['left_hand'], :]
+ mesh_out_lhand = mesh_out[:, smpl_x.hand_vertex_idx['left_hand'], :]
+ mesh_gt_rhand = mesh_gt[:, smpl_x.hand_vertex_idx['right_hand'], :]
+ mesh_out_rhand = mesh_out[:, smpl_x.hand_vertex_idx['right_hand'], :]
+ mesh_out_lhand_align = \
+ mesh_out_lhand - \
+ np.dot(smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['lwrist'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['lwrist'], None, :]
+
+ mesh_out_rhand_align = \
+ mesh_out_rhand - \
+ np.dot(smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['rwrist'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['rwrist'], None, :]
+
+ eval_result['mpvpe_l_hand'].append(
+ np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean() *
+ 1000)
+ eval_result['mpvpe_r_hand'].append(
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean() *
+ 1000)
+ eval_result['mpvpe_hand'].append(
+ (np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean() *
+ 1000 +
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean() *
+ 1000) / 2.)
+ mesh_out_lhand_align = rigid_align_batch(mesh_out_lhand, mesh_gt_lhand)
+ mesh_out_rhand_align = rigid_align_batch(mesh_out_rhand, mesh_gt_rhand)
+ eval_result['pa_mpvpe_l_hand'].append(
+ np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean() *
+ 1000)
+ eval_result['pa_mpvpe_r_hand'].append(
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean() *
+ 1000)
+ eval_result['pa_mpvpe_hand'].append(
+ (np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean() *
+ 1000 +
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean() *
+ 1000) / 2.)
+
+ # MPVPE from face vertices
+ mesh_gt_face = mesh_gt[:, smpl_x.face_vertex_idx, :]
+ mesh_out_face = mesh_out[:, smpl_x.face_vertex_idx, :]
+ mesh_out_face_align = \
+ mesh_out_face - \
+ np.dot(smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['neck'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['neck'], None, :]
+ eval_result['mpvpe_face'].append(
+ np.sqrt(np.sum(
+ (mesh_out_face_align - mesh_gt_face)**2, -1)).mean() * 1000)
+ mesh_out_face_align = rigid_align_batch(mesh_out_face, mesh_gt_face)
+ eval_result['pa_mpvpe_face'].append(
+ np.sqrt(np.sum(
+ (mesh_out_face_align - mesh_gt_face)**2, -1)).mean() * 1000)
+
+ save_error=True
+ if save_error:
+ writer = csv.writer(file)
+ new_line = [ann_idx[n], img_path[n], eval_result['mpvpe_all'][-1], eval_result['pa_mpvpe_all'][-1]]
+ writer.writerow(new_line)
+ # self.save_idx += 1
+ return eval_result
+
+ def print_eval_result(self, eval_result):
+
+ print('======ARCTIC-val======')
+ print('PA MPVPE (All): %.2f mm' % np.mean(eval_result['pa_mpvpe_all']))
+ print('PA MPVPE (L-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_l_hand']))
+ print('PA MPVPE (R-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_r_hand']))
+ print('PA MPVPE (Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_hand']))
+ print('PA MPVPE (Face): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_face']))
+ print()
+
+ print('MPVPE (All): %.2f mm' % np.mean(eval_result['mpvpe_all']))
+ print('MPVPE (L-Hands): %.2f mm' %
+ np.mean(eval_result['mpvpe_l_hand']))
+ print('MPVPE (R-Hands): %.2f mm' %
+ np.mean(eval_result['mpvpe_r_hand']))
+ print('MPVPE (Hands): %.2f mm' % np.mean(eval_result['mpvpe_hand']))
+ print('MPVPE (Face): %.2f mm' % np.mean(eval_result['mpvpe_face']))
+
+ out_file = osp.join(cfg.result_dir,'arctic_val.txt')
+ if os.path.exists(out_file):
+ f = open(out_file, 'a+')
+ else:
+ f = open(out_file, 'w', encoding="utf-8")
+ f.write('\n')
+ f.write(f'{cfg.exp_name}\n')
+ f.write(f'ARCTIC-val dataset: \n')
+ f.write('PA MPVPE (All): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_all']))
+ f.write('PA MPVPE (L-Hands): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_l_hand']))
+ f.write('PA MPVPE (R-Hands): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_r_hand']))
+ f.write('PA MPVPE (Hands): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_hand']))
+ f.write('PA MPVPE (Face): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_face']))
+ f.write('MPVPE (All): %.2f mm\n' % np.mean(eval_result['mpvpe_all']))
+ f.write('MPVPE (L-Hands): %.2f mm\n' %
+ np.mean(eval_result['mpvpe_l_hand']))
+ f.write('MPVPE (R-Hands): %.2f mm\n' %
+ np.mean(eval_result['mpvpe_r_hand']))
+ f.write('MPVPE (Hands): %.2f mm\n' % np.mean(eval_result['mpvpe_hand']))
+ f.write('MPVPE (Face): %.2f mm\n' % np.mean(eval_result['mpvpe_face']))
+        f.close()
diff --git a/datasets/BEDLAM.py b/datasets/BEDLAM.py
new file mode 100644
index 0000000000000000000000000000000000000000..566de0e3253a12a4086b5d635b5dcf6410f42fe5
--- /dev/null
+++ b/datasets/BEDLAM.py
@@ -0,0 +1,32 @@
+import os.path as osp
+from config.config import cfg
+from humandata import HumanDataset
+
+
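+# BEDLAM synthetic training data; it is used for training only here, so no evaluate() override is
+# needed and the generic HumanDataset loader (with optional caching) is reused as-is.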
+class BEDLAM(HumanDataset):
+ def __init__(self, transform, data_split):
+ super(BEDLAM, self).__init__(transform, data_split)
+
+ self.img_dir = './data/datasets/bedlam/train_images/'
+ self.annot_path = 'data/preprocessed_npz/multihuman_data/bedlam_train_multi_0915.npz'
+ self.annot_path_cache = 'data/preprocessed_npz/cache/bedlam_train_cache_080824.npz'
+ self.use_cache = getattr(cfg, 'use_cache', False)
+
+        self.img_shape = None  # (1024, 1024) # (h, w)
+ self.cam_param = {}
+
+ # load data or cache
+ if self.use_cache and osp.isfile(self.annot_path_cache):
+ print(
+ f'[{self.__class__.__name__}] loading cache from {self.annot_path_cache}'
+ )
+ self.datalist = self.load_cache(self.annot_path_cache)
+ else:
+ if self.use_cache:
+ print(
+ f'[{self.__class__.__name__}] Cache not found, generating cache...'
+ )
+ self.datalist = self.load_data(train_sample_interval=getattr(
+ cfg, f'{self.__class__.__name__}_train_sample_interval', 5))
+ if self.use_cache:
+ self.save_cache(self.annot_path_cache, self.datalist)
diff --git a/datasets/COCO_NA.py b/datasets/COCO_NA.py
new file mode 100644
index 0000000000000000000000000000000000000000..38a553deb21e5e005becc4442bc0ea7a32189dcf
--- /dev/null
+++ b/datasets/COCO_NA.py
@@ -0,0 +1,36 @@
+import os
+import os.path as osp
+import numpy as np
+
+# from osx.common.utils.human_models import smpl_x
+
+from humandata import HumanDataset
+from config.config import cfg
+
+
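+# COCO whole-body training set loaded via HumanDataset; self.keypoints2d selects the original 2D
+# keypoints ('keypoints2d_ori') from the preprocessed npz.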
+class COCO_NA(HumanDataset):
+ def __init__(self, transform, data_split):
+ super(COCO_NA, self).__init__(transform, data_split)
+ self.img_dir = 'data/datasets/coco_2017'
+ self.annot_path = 'data/preprocessed_npz/multihuman_data/coco_wholebody_new_train_multi.npz'
+ self.annot_path_cache = 'data/preprocessed_npz/cache/coco_train_cache_080824.npz'
+ # osp.join(cfg.data_dir, 'cache', filename)
+ self.keypoints2d = 'keypoints2d_ori'
+ self.use_cache = getattr(cfg, 'use_cache', False)
+ self.cam_param = {}
+
+ # load data or cache
+ if self.use_cache and osp.isfile(self.annot_path_cache):
+ print(
+ f'[{self.__class__.__name__}] loading cache from {self.annot_path_cache}'
+ )
+ self.datalist = self.load_cache(self.annot_path_cache)
+ else:
+ if self.use_cache:
+ print(
+ f'[{self.__class__.__name__}] Cache not found, generating cache...'
+ )
+ self.datalist = self.load_data(train_sample_interval=getattr(
+ cfg, f'{self.__class__.__name__}_train_sample_interval', 1))
+ if self.use_cache:
+ self.save_cache(self.annot_path_cache, self.datalist)
diff --git a/datasets/EHF.py b/datasets/EHF.py
new file mode 100644
index 0000000000000000000000000000000000000000..f75fad8515111c09c2dd968ae69f27e10908eb88
--- /dev/null
+++ b/datasets/EHF.py
@@ -0,0 +1,289 @@
+import os
+import os.path as osp
+from glob import glob
+import numpy as np
+from config.config import cfg
+import copy
+import json
+import cv2
+import torch
+from pycocotools.coco import COCO
+from util.human_models import smpl_x
+from util.preprocessing import load_img, process_bbox, load_ply
+from util.transforms import rigid_align, rigid_align_batch
+from humandata import HumanDataset
+import csv
+
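+# EHF evaluation set (the bundled annotations cover 100 frames with SMPL-X ground-truth meshes);
+# reports MPVPE / PA-MPVPE and PA-MPJPE for body, hands and face.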
+class EHF(HumanDataset):
+ def __init__(self, transform, data_split):
+ super(EHF, self).__init__(transform, data_split)
+
+ self.transform = transform
+ self.data_split = data_split
+ self.save_idx = 0
+ # self.cam_param = {'R': [-2.98747896, 0.01172457, -0.05704687]}
+ # self.cam_param['R'], _ = cv2.Rodrigues(np.array(self.cam_param['R']))
+ self.cam_param = {}
+ self.img_dir = 'data/data_weichen/ehf'
+ self.img_shape = [1200, 1600]
+
+ self.annot_path = 'data_tmp/multihuman_data/ehf_val_230908_100.npz'
+ self.annot_path_cache = 'data_tmp/cache/ehf_val_cache_230908_100.npz'
+        self.use_cache = getattr(cfg, 'use_cache', False)
+
+ if self.use_cache and osp.isfile(self.annot_path_cache):
+ print(f'[{self.__class__.__name__}] loading cache from {self.annot_path_cache}')
+ self.datalist = self.load_cache(self.annot_path_cache)
+ else:
+ if self.use_cache:
+ print(f'[{self.__class__.__name__}] Cache not found, generating cache...')
+ self.datalist = self.load_data(
+ train_sample_interval=getattr(cfg, f'{self.__class__.__name__}_train_sample_interval', 1))
+ if self.use_cache:
+ self.save_cache(self.annot_path_cache, self.datalist)
+
+
+ def evaluate(self, outs, cur_sample_idx):
+ annots = self.datalist
+ sample_num = len(outs)
+ eval_result = {
+ 'pa_mpvpe_all': [],
+ 'pa_mpvpe_l_hand': [],
+ 'pa_mpvpe_r_hand': [],
+ 'pa_mpvpe_hand': [],
+ 'pa_mpvpe_face': [],
+ 'mpvpe_all': [],
+ 'mpvpe_l_hand': [],
+ 'mpvpe_r_hand': [],
+ 'mpvpe_hand': [],
+ 'mpvpe_face': [],
+ 'pa_mpjpe_body': [],
+ 'pa_mpjpe_l_hand': [],
+ 'pa_mpjpe_r_hand': [],
+ 'pa_mpjpe_hand': []
+ }
+
+ csv_file = f'{cfg.result_dir}/ehf_smplx_error.csv'
+ file = open(csv_file, 'a', newline='')
+ for n in range(sample_num):
+ annot = annots[cur_sample_idx + n]
+ ann_id = annot['img_path'].split('/')[-1].split('_')[0]
+ out = outs[n]
+ ann_idx = out['gt_ann_idx']
+ img_path = []
+ for ann_id in ann_idx:
+ img_path.append(annots[ann_id]['img_path'])
+ eval_result['img_path'] = img_path
+ eval_result['ann_idx'] = ann_idx
+            # MPVPE from all vertices
+ # mesh_gt = np.dot(
+ # self.cam_param['R'],
+ # out['smplx_mesh_cam_target'].transpose(0,2,1)
+ # ).transpose(1,2,0)
+ mesh_gt = out['smplx_mesh_cam_target']
+ mesh_out = out['smplx_mesh_cam']
+
+ # mesh_gt_align = rigid_align(mesh_gt, mesh_out)
+
+ # print(mesh_out.shape)
+ mesh_out_align = rigid_align_batch(mesh_out, mesh_gt)
+ eval_result['pa_mpvpe_all'].append(
+ np.sqrt(np.sum(
+ (mesh_out_align - mesh_gt)**2, -1)).mean() * 1000)
+ mesh_out_align = mesh_out - np.dot(
+ smpl_x.J_regressor,
+ mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['pelvis'], None, :] + np.dot(
+ smpl_x.J_regressor,
+ mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['pelvis'], None, :]
+ eval_result['mpvpe_all'].append(
+ np.sqrt(np.sum(
+ (mesh_out_align - mesh_gt)**2, -1)).mean() * 1000)
+
+ # MPVPE from hand vertices
+ mesh_gt_lhand = mesh_gt[:, smpl_x.hand_vertex_idx['left_hand'], :]
+ mesh_out_lhand = mesh_out[:, smpl_x.hand_vertex_idx['left_hand'], :]
+ mesh_out_lhand_align = rigid_align_batch(mesh_out_lhand, mesh_gt_lhand)
+ mesh_gt_rhand = mesh_gt[:, smpl_x.hand_vertex_idx['right_hand'], :]
+ mesh_out_rhand = mesh_out[:, smpl_x.hand_vertex_idx['right_hand'], :]
+ mesh_out_rhand_align = rigid_align_batch(mesh_out_rhand, mesh_gt_rhand)
+ eval_result['pa_mpvpe_l_hand'].append(
+ np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean() *
+ 1000)
+ eval_result['pa_mpvpe_r_hand'].append(
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean() *
+ 1000)
+ eval_result['pa_mpvpe_hand'].append(
+ (np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean() *
+ 1000 +
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean() *
+ 1000) / 2.)
+
+ mesh_out_lhand_align = mesh_out_lhand - np.dot(
+ smpl_x.J_regressor,
+ mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['lwrist'], None, :] + np.dot(
+ smpl_x.J_regressor,
+ mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['lwrist'], None, :]
+ mesh_out_rhand_align = mesh_out_rhand - np.dot(
+ smpl_x.J_regressor,
+ mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['rwrist'], None, :] + np.dot(
+ smpl_x.J_regressor,
+ mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['rwrist'], None, :]
+
+ eval_result['mpvpe_l_hand'].append(
+ np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean() *
+ 1000)
+ eval_result['mpvpe_r_hand'].append(
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean() *
+ 1000)
+ eval_result['mpvpe_hand'].append(
+ (np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean() *
+ 1000 +
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean() *
+ 1000) / 2.)
+
+ # MPVPE from face vertices
+ mesh_gt_face = mesh_gt[:, smpl_x.face_vertex_idx, :]
+ mesh_out_face = mesh_out[:, smpl_x.face_vertex_idx, :]
+ mesh_out_face_align = rigid_align_batch(mesh_out_face, mesh_gt_face)
+ eval_result['pa_mpvpe_face'].append(
+ np.sqrt(np.sum(
+ (mesh_out_face_align - mesh_gt_face)**2, -1)).mean() * 1000)
+ mesh_out_face_align = mesh_out_face - np.dot(
+ smpl_x.J_regressor,
+ mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['neck'], None, :] + np.dot(
+ smpl_x.J_regressor,
+ mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['neck'], None, :]
+ eval_result['mpvpe_face'].append(
+ np.sqrt(np.sum(
+ (mesh_out_face_align - mesh_gt_face)**2, -1)).mean() * 1000)
+
+ # MPJPE from body joints
+ joint_gt_body = np.dot(smpl_x.j14_regressor, mesh_gt).transpose(1,0,2)
+ joint_out_body = np.dot(smpl_x.j14_regressor, mesh_out).transpose(1,0,2)
+ joint_out_body_align = rigid_align_batch(joint_out_body, joint_gt_body)
+ eval_result['pa_mpjpe_body'].append(
+ np.sqrt(np.sum(
+ (joint_out_body_align - joint_gt_body)**2, -1)).mean() *
+ 1000)
+
+ # MPJPE from hand joints
+ joint_gt_lhand = np.dot(smpl_x.orig_hand_regressor['left'],
+ mesh_gt).transpose(1,0,2)
+ joint_out_lhand = np.dot(smpl_x.orig_hand_regressor['left'],
+ mesh_out).transpose(1,0,2)
+ joint_out_lhand_align = rigid_align_batch(joint_out_lhand,
+ joint_gt_lhand)
+ joint_gt_rhand = np.dot(smpl_x.orig_hand_regressor['right'],
+ mesh_gt).transpose(1,0,2)
+ joint_out_rhand = np.dot(smpl_x.orig_hand_regressor['right'],
+ mesh_out).transpose(1,0,2)
+ joint_out_rhand_align = rigid_align_batch(joint_out_rhand,
+ joint_gt_rhand)
+ eval_result['pa_mpjpe_l_hand'].append(
+ np.sqrt(np.sum(
+ (joint_out_lhand_align - joint_gt_lhand)**2, -1)).mean() *
+ 1000)
+ eval_result['pa_mpjpe_r_hand'].append(
+ np.sqrt(np.sum(
+                    (joint_out_rhand_align - joint_gt_rhand)**2, -1)).mean() *
+ 1000)
+ eval_result['pa_mpjpe_hand'].append(
+ (np.sqrt(np.sum(
+ (joint_out_lhand_align - joint_gt_lhand)**2, -1)).mean() *
+ 1000 +
+ np.sqrt(np.sum(
+ (joint_out_rhand_align - joint_gt_rhand)**2, -1)).mean() *
+ 1000) / 2.)
+ save_error=True
+ if save_error:
+ writer = csv.writer(file)
+ new_line = [ann_idx[n],img_path[n], eval_result['mpvpe_all'][-1], eval_result['pa_mpvpe_all'][-1]]
+ writer.writerow(new_line)
+ self.save_idx += 1
+
+ # vis = cfg.vis
+
+
+ for k,v in eval_result.items():
+ if k != 'img_path' and k != 'ann_idx':
+
+ if len(v)>1:
+ eval_result[k] = np.concatenate(v,axis=0)
+ else:
+ eval_result[k] = np.array(v)
+ return eval_result
+
+ def print_eval_result(self, eval_result):
+ print('======EHF======')
+ print('PA MPVPE (All): %.2f mm' % np.mean(eval_result['pa_mpvpe_all']))
+ print('PA MPVPE (L-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_l_hand']))
+ print('PA MPVPE (R-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_r_hand']))
+ print('PA MPVPE (Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_hand']))
+ print('PA MPVPE (Face): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_face']))
+ print()
+
+ print('MPVPE (All): %.2f mm' % np.mean(eval_result['mpvpe_all']))
+ print('MPVPE (L-Hands): %.2f mm' %
+ np.mean(eval_result['mpvpe_l_hand']))
+ print('MPVPE (R-Hands): %.2f mm' %
+ np.mean(eval_result['mpvpe_r_hand']))
+ print('MPVPE (Hands): %.2f mm' % np.mean(eval_result['mpvpe_hand']))
+ print('MPVPE (Face): %.2f mm' % np.mean(eval_result['mpvpe_face']))
+ print()
+
+ print('PA MPJPE (Body): %.2f mm' %
+ np.mean(eval_result['pa_mpjpe_body']))
+ print('PA MPJPE (L-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpjpe_l_hand']))
+ print('PA MPJPE (R-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpjpe_r_hand']))
+ print('PA MPJPE (Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpjpe_hand']))
+ out_file = osp.join(cfg.result_dir,'ehf_test.txt')
+ if os.path.exists(out_file):
+ f = open(out_file, 'a+')
+ else:
+ f = open(out_file, 'w', encoding="utf-8")
+
+ f.write('\n')
+ f.write(f'{cfg.exp_name}\n')
+ f.write(f'EHF dataset: \n')
+ f.write('PA MPVPE (All): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_all']))
+ f.write('PA MPVPE (L-Hands): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_l_hand']))
+ f.write('PA MPVPE (R-Hands): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_r_hand']))
+ f.write('PA MPVPE (Hands): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_hand']))
+ f.write('PA MPVPE (Face): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_face']))
+ f.write('MPVPE (All): %.2f mm\n' % np.mean(eval_result['mpvpe_all']))
+ f.write('MPVPE (L-Hands): %.2f mm\n' %
+ np.mean(eval_result['mpvpe_l_hand']))
+ f.write('MPVPE (R-Hands): %.2f mm\n' %
+ np.mean(eval_result['mpvpe_r_hand']))
+ f.write('MPVPE (Hands): %.2f mm\n' % np.mean(eval_result['mpvpe_hand']))
+ f.write('MPVPE (Face): %.2f mm\n' % np.mean(eval_result['mpvpe_face']))
+ f.write('PA MPJPE (Body): %.2f mm\n' %
+ np.mean(eval_result['pa_mpjpe_body']))
+ f.write('PA MPJPE (L-Hands): %.2f mm\n' %
+ np.mean(eval_result['pa_mpjpe_l_hand']))
+ f.write('PA MPJPE (R-Hands): %.2f mm\n' %
+ np.mean(eval_result['pa_mpjpe_r_hand']))
+ f.write('PA MPJPE (Hands): %.2f mm\n' %
+ np.mean(eval_result['pa_mpjpe_hand']))
+
+ f.close()
+
diff --git a/datasets/EgoBody_Egocentric.py b/datasets/EgoBody_Egocentric.py
new file mode 100644
index 0000000000000000000000000000000000000000..c69993965abdd12b6da0bdb4b6400913abebceb1
--- /dev/null
+++ b/datasets/EgoBody_Egocentric.py
@@ -0,0 +1,211 @@
+import os
+import os.path as osp
+import numpy as np
+import torch
+import cv2
+import json
+import copy
+import csv
+from pycocotools.coco import COCO
+from config.config import cfg
+from util.human_models import smpl_x
+
+from util.transforms import world2cam, cam2pixel, rigid_align
+from humandata import HumanDataset
+from util.transforms import rigid_align, rigid_align_batch
+
+
+
+class EgoBody_Egocentric(HumanDataset):
+ def __init__(self, transform, data_split):
+ super(EgoBody_Egocentric, self).__init__(transform, data_split)
+
+ if self.data_split == 'train':
+ filename = 'data/preprocessed_npz/multihuman_data/egobody_egocentric_train_multi_080824.npz'
+ self.annot_path_cache = 'data/preprocessed_npz/cache/egobody_egocentric_train_cache_080824.npz'
+ self.sample_interval = 5
+ else:
+ filename = 'data/preprocessed_npz/multihuman_data/egobody_egocentric_val_multi_080824.npz'
+ self.annot_path_cache = 'data/preprocessed_npz/cache/egobody_egocentric_val_cache_080824.npz'
+ self.sample_interval = 1
+ self.use_betas_neutral = getattr(cfg, 'egobody_fix_betas', False)
+
+ self.img_dir = 'data/osx_data/EgoBody'
+ self.annot_path = filename
+ self.use_cache = getattr(cfg, 'use_cache', False)
+ self.img_shape = (1080, 1920) # (h, w)
+ self.cam_param = {}
+
+ # check image shape
+ img_path = osp.join(self.img_dir,
+ np.load(self.annot_path)['image_path'][0])
+ img_shape = cv2.imread(img_path).shape[:2]
+ assert self.img_shape == img_shape, 'image shape is incorrect: {} vs {}'.format(
+ self.img_shape, img_shape)
+
+ # load data or cache
+ if self.use_cache and osp.isfile(self.annot_path_cache):
+ print(
+ f'[{self.__class__.__name__}] loading cache from {self.annot_path_cache}'
+ )
+ self.datalist = self.load_cache(self.annot_path_cache)
+ else:
+ if self.use_cache:
+ print(
+ f'[{self.__class__.__name__}] Cache not found, generating cache...'
+ )
+ self.datalist = self.load_data(train_sample_interval=getattr(
+ cfg, f'{self.__class__.__name__}_train_sample_interval', self.sample_interval))
+ if self.use_cache:
+ self.save_cache(self.annot_path_cache, self.datalist)
+
+ def evaluate(self, outs, cur_sample_idx):
+ annots = self.datalist
+ sample_num = len(outs)
+ eval_result = {
+ 'pa_mpvpe_all': [],
+ 'pa_mpvpe_l_hand': [],
+ 'pa_mpvpe_r_hand': [],
+ 'pa_mpvpe_hand': [],
+ 'pa_mpvpe_face': [],
+ 'mpvpe_all': [],
+ 'mpvpe_l_hand': [],
+ 'mpvpe_r_hand': [],
+ 'mpvpe_hand': [],
+ 'mpvpe_face': []
+ }
+
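+        # per-sample errors are stored in millimetres; the face entries are
+        # initialised but never filled in this dataset's evaluate()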
+ vis = getattr(cfg, 'vis', False)
+ vis_save_dir = cfg.vis_dir
+ csv_file = f'{cfg.result_dir}/egobody_smplx_error.csv'
+ file = open(csv_file, 'a', newline='')
+ for n in range(sample_num):
+ annot = annots[cur_sample_idx + n]
+ out = outs[n]
+ mesh_gt = out['smplx_mesh_cam_target']
+ mesh_out = out['smplx_mesh_cam']
+ ann_idx = out['gt_ann_idx']
+ img_path = []
+ for ann_id in ann_idx:
+ img_path.append(annots[ann_id]['img_path'])
+ eval_result['img_path'] = img_path
+ eval_result['ann_idx'] = ann_idx
+ # MPVPE from all vertices
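+            # shift the predicted mesh so its pelvis matches the GT pelvis before measuring MPVPE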
+ mesh_out_align = \
+ mesh_out - np.dot(
+ smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['pelvis'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['pelvis'], None, :]
+
+ eval_result['mpvpe_all'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_align - mesh_gt)**2, -1)).mean(-1) * 1000)
+ mesh_out_align = rigid_align_batch(mesh_out, mesh_gt)
+ eval_result['pa_mpvpe_all'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_align - mesh_gt)**2, -1)).mean(-1) * 1000)
+
+ # MPVPE from hand vertices
+ mesh_gt_lhand = mesh_gt[:, smpl_x.hand_vertex_idx['left_hand'], :]
+ mesh_out_lhand = mesh_out[:, smpl_x.hand_vertex_idx['left_hand'], :]
+ mesh_gt_rhand = mesh_gt[:, smpl_x.hand_vertex_idx['right_hand'], :]
+ mesh_out_rhand = mesh_out[:, smpl_x.hand_vertex_idx['right_hand'], :]
+ mesh_out_lhand_align = \
+ mesh_out_lhand - \
+ np.dot(smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['lwrist'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['lwrist'], None, :]
+
+ mesh_out_rhand_align = \
+ mesh_out_rhand - \
+ np.dot(smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['rwrist'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['rwrist'], None, :]
+
+ eval_result['mpvpe_l_hand'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean(-1) *
+ 1000)
+ eval_result['mpvpe_r_hand'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean(-1) *
+ 1000)
+ eval_result['mpvpe_hand'].extend(
+ (np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean(-1) *
+ 1000 +
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean(-1) *
+ 1000) / 2.)
+ mesh_out_lhand_align = rigid_align_batch(mesh_out_lhand, mesh_gt_lhand)
+ mesh_out_rhand_align = rigid_align_batch(mesh_out_rhand, mesh_gt_rhand)
+ eval_result['pa_mpvpe_l_hand'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean(-1) *
+ 1000)
+ eval_result['pa_mpvpe_r_hand'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean(-1) *
+ 1000)
+ eval_result['pa_mpvpe_hand'].extend(
+ (np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean(-1) *
+ 1000 +
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean(-1) *
+ 1000) / 2.)
+            save_error = True
+ if save_error:
+ writer = csv.writer(file)
+ new_line = [ann_idx[n], img_path[n], eval_result['mpvpe_all'][-1], eval_result['pa_mpvpe_all'][-1]]
+ writer.writerow(new_line)
+
+        file.close()
+        return eval_result
+
+
+ def print_eval_result(self, eval_result):
+
+ print('======Egocentric======')
+ print('PA MPVPE (All): %.2f mm' % np.mean(eval_result['pa_mpvpe_all']))
+ print('PA MPVPE (L-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_l_hand']))
+ print('PA MPVPE (R-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_r_hand']))
+ print('PA MPVPE (Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_hand']))
+ print('PA MPVPE (Face): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_face']))
+ print()
+
+ print('MPVPE (All): %.2f mm' % np.mean(eval_result['mpvpe_all']))
+ print('MPVPE (L-Hands): %.2f mm' %
+ np.mean(eval_result['mpvpe_l_hand']))
+ print('MPVPE (R-Hands): %.2f mm' %
+ np.mean(eval_result['mpvpe_r_hand']))
+ print('MPVPE (Hands): %.2f mm' % np.mean(eval_result['mpvpe_hand']))
+ print('MPVPE (Face): %.2f mm' % np.mean(eval_result['mpvpe_face']))
+
+        out_file = osp.join(cfg.result_dir, 'Egocentric_val.txt')
+        f = open(out_file, 'a', encoding="utf-8")
+
+ f.write('\n')
+ f.write(f'{cfg.exp_name}\n')
+ f.write(f'Egocentric dataset: \n')
+ f.write('PA MPVPE (All): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_all']))
+ f.write('PA MPVPE (L-Hands): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_l_hand']))
+ f.write('PA MPVPE (R-Hands): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_r_hand']))
+ f.write('PA MPVPE (Hands): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_hand']))
+ f.write('PA MPVPE (Face): %.2f mm\n' %
+ np.mean(eval_result['pa_mpvpe_face']))
+ f.write('MPVPE (All): %.2f mm\n' % np.mean(eval_result['mpvpe_all']))
+ f.write('MPVPE (L-Hands): %.2f mm\n' %
+ np.mean(eval_result['mpvpe_l_hand']))
+ f.write('MPVPE (R-Hands): %.2f mm\n' %
+ np.mean(eval_result['mpvpe_r_hand']))
+ f.write('MPVPE (Hands): %.2f mm\n' % np.mean(eval_result['mpvpe_hand']))
+        f.write('MPVPE (Face): %.2f mm\n' % np.mean(eval_result['mpvpe_face']))
+        f.close()
diff --git a/datasets/EgoBody_Kinect.py b/datasets/EgoBody_Kinect.py
new file mode 100644
index 0000000000000000000000000000000000000000..999fe8654a9f9f712aaf3c3ea63ec6f20efd1604
--- /dev/null
+++ b/datasets/EgoBody_Kinect.py
@@ -0,0 +1,194 @@
+import os
+import os.path as osp
+import numpy as np
+import torch
+import cv2
+import json
+import copy
+import csv
+from pycocotools.coco import COCO
+from config.config import cfg
+from util.human_models import smpl_x
+
+from util.transforms import world2cam, cam2pixel, rigid_align
+from humandata import HumanDataset
+from util.transforms import rigid_align, rigid_align_batch
+
+
+class EgoBody_Kinect(HumanDataset):
+ def __init__(self, transform, data_split):
+ super(EgoBody_Kinect, self).__init__(transform, data_split)
+
+ if self.data_split == 'train':
+ filename = 'data/preprocessed_npz/multihuman_data/egobody_kinect_train_multi_080824.npz'
+ self.annot_path_cache = 'data/preprocessed_npz/cache/egobody_kinect_train_cache_080824.npz'
+ self.sample_interval = 10
+ else:
+ filename = 'data/preprocessed_npz/egobody_kinect_test_230503_043_fix_betas_multi.npz'
+ self.annot_path_cache = 'data/preprocessed_npz/egobody_kinect_test_230503_043_fix_betas_multi_cache_100.npz'
+ self.sample_interval = 100
+ self.use_betas_neutral = getattr(cfg, 'egobody_fix_betas', False)
+
+ self.img_dir = 'data/osx_data/EgoBody'
+ self.annot_path = filename
+
+ self.use_cache = getattr(cfg, 'use_cache', False)
+ self.img_shape = (1080, 1920) # (h, w)
+ self.cam_param = {}
+
+ # check image shape
+ img_path = osp.join(self.img_dir,
+ np.load(self.annot_path)['image_path'][0])
+ img_shape = cv2.imread(img_path).shape[:2]
+ assert self.img_shape == img_shape, 'image shape is incorrect: {} vs {}'.format(
+ self.img_shape, img_shape)
+
+ # load data or cache
+ if self.use_cache and osp.isfile(self.annot_path_cache):
+ print(
+ f'[{self.__class__.__name__}] loading cache from {self.annot_path_cache}'
+ )
+ self.datalist = self.load_cache(self.annot_path_cache)
+ else:
+ if self.use_cache:
+ print(
+ f'[{self.__class__.__name__}] Cache not found, generating cache...'
+ )
+ self.datalist = self.load_data(train_sample_interval=self.sample_interval)
+ if self.use_cache:
+ self.save_cache(self.annot_path_cache, self.datalist)
+
+    def evaluate(self, outs, cur_sample_idx):
+ annots = self.datalist
+ sample_num = len(outs)
+ eval_result = {
+ 'pa_mpvpe_all': [],
+ 'pa_mpvpe_l_hand': [],
+ 'pa_mpvpe_r_hand': [],
+ 'pa_mpvpe_hand': [],
+ 'pa_mpvpe_face': [],
+ 'mpvpe_all': [],
+ 'mpvpe_l_hand': [],
+ 'mpvpe_r_hand': [],
+ 'mpvpe_hand': [],
+ 'mpvpe_face': []
+ }
+
+ vis = getattr(cfg, 'vis', False)
+ vis_save_dir = cfg.vis_dir
+
+ for n in range(sample_num):
+ annot = annots[cur_sample_idx + n]
+ out = outs[n]
+ mesh_gt = out['smplx_mesh_cam_target']
+ mesh_out = out['smplx_mesh_cam']
+ ann_idx = out['gt_ann_idx']
+ img_path = []
+ for ann_id in ann_idx:
+ img_path.append(annots[ann_id]['img_path'])
+ eval_result['img_path'] = img_path
+ eval_result['ann_idx'] = ann_idx
+ # MPVPE from all vertices
+ mesh_out_align = \
+ mesh_out - np.dot(
+ smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['pelvis'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['pelvis'], None, :]
+
+ eval_result['mpvpe_all'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_align - mesh_gt)**2, -1)).mean(-1) * 1000)
+ mesh_out_align = rigid_align_batch(mesh_out, mesh_gt)
+ eval_result['pa_mpvpe_all'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_align - mesh_gt)**2, -1)).mean(-1) * 1000)
+
+ # MPVPE from hand vertices
+ mesh_gt_lhand = mesh_gt[:, smpl_x.hand_vertex_idx['left_hand'], :]
+ mesh_out_lhand = mesh_out[:, smpl_x.hand_vertex_idx['left_hand'], :]
+ mesh_gt_rhand = mesh_gt[:, smpl_x.hand_vertex_idx['right_hand'], :]
+ mesh_out_rhand = mesh_out[:, smpl_x.hand_vertex_idx['right_hand'], :]
+ mesh_out_lhand_align = \
+ mesh_out_lhand - \
+ np.dot(smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['lwrist'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['lwrist'], None, :]
+
+ mesh_out_rhand_align = \
+ mesh_out_rhand - \
+ np.dot(smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['rwrist'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['rwrist'], None, :]
+
+ eval_result['mpvpe_l_hand'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean(-1) *
+ 1000)
+ eval_result['mpvpe_r_hand'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean(-1) *
+ 1000)
+ eval_result['mpvpe_hand'].extend(
+ (np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean(-1) *
+ 1000 +
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean(-1) *
+ 1000) / 2.)
+ mesh_out_lhand_align = rigid_align_batch(mesh_out_lhand, mesh_gt_lhand)
+ mesh_out_rhand_align = rigid_align_batch(mesh_out_rhand, mesh_gt_rhand)
+ eval_result['pa_mpvpe_l_hand'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean(-1) *
+ 1000)
+ eval_result['pa_mpvpe_r_hand'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean(-1) *
+ 1000)
+ eval_result['pa_mpvpe_hand'].extend(
+ (np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, -1)).mean(-1) *
+ 1000 +
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).mean(-1) *
+ 1000) / 2.)
+ vis = False
+ if vis:
+ import mmcv
+ img = (out['img']).transpose(0,2,3,1)
+ img = mmcv.imdenormalize(
+ img=img[0],
+ mean=np.array([123.675, 116.28, 103.53]),
+ std=np.array([58.395, 57.12, 57.375]),
+ to_bgr=True).astype(np.uint8)
+ from detrsmpl.core.visualization.visualize_keypoints2d import visualize_kp2d
+                # import ipdb; ipdb.set_trace()
+ visualize_kp2d(
+ out['smplx_joint_proj'][0][None],
+ image_array=img[None].copy(),
+ disable_limbs=True,
+ overwrite=True,
+ output_path='./figs/pred2d'
+ )
+ from pytorch3d.io import save_obj
+ save_obj('temp.obj',verts=out['smplx_mesh_cam'][0],faces=torch.tensor([]))
+ # MPVPE from face vertices
+ mesh_gt_face = mesh_gt[:, smpl_x.face_vertex_idx, :]
+ mesh_out_face = mesh_out[:, smpl_x.face_vertex_idx, :]
+ mesh_out_face_align = \
+ mesh_out_face - \
+ np.dot(smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['neck'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['neck'], None, :]
+ eval_result['mpvpe_face'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_face_align - mesh_gt_face)**2, -1)).mean(-1) * 1000)
+ mesh_out_face_align = rigid_align_batch(mesh_out_face, mesh_gt_face)
+ eval_result['pa_mpvpe_face'].extend(
+ np.sqrt(np.sum(
+ (mesh_out_face_align - mesh_gt_face)**2, -1)).mean(-1) * 1000)
+
+ # for k,v in eval_result.items():
+ # if k != 'img_path' and k != 'ann_idx':
+ # # import ipdb;ipdb.set_trace()
+ # if len(v)>1:
+ # eval_result[k] = np.concatenate(v,axis=0)
+ # else:
+ # eval_result[k] = np.array(v)
+
+ return eval_result
\ No newline at end of file
diff --git a/datasets/INFERENCE.py b/datasets/INFERENCE.py
new file mode 100644
index 0000000000000000000000000000000000000000..122c4edb6e7c495c0b5df5274d628dbe79e8b3a3
--- /dev/null
+++ b/datasets/INFERENCE.py
@@ -0,0 +1,289 @@
+import os
+import os.path as osp
+from glob import glob
+import numpy as np
+from config.config import cfg
+import copy
+import json
+import pickle
+import cv2
+import torch
+from pycocotools.coco import COCO
+from util.human_models import smpl_x
+from util.preprocessing import load_img, sanitize_bbox, process_bbox,augmentation_keep_size, load_ply, load_obj
+from util.transforms import rigid_align, rigid_align_batch
+import tqdm
+import random
+from util.formatting import DefaultFormatBundle
+from detrsmpl.data.datasets.pipelines.transforms import Normalize
+from humandata import HumanDataset
+from detrsmpl.utils.demo_utils import xywh2xyxy, xyxy2xywh, box2cs
+from detrsmpl.core.conventions.keypoints_mapping import convert_kps
+import mmcv
+import cv2
+import numpy as np
+from detrsmpl.core.visualization.visualize_keypoints2d import visualize_kp2d
+from detrsmpl.core.visualization.visualize_smpl import visualize_smpl_hmr,render_smpl
+from detrsmpl.models.body_models.builder import build_body_model
+from detrsmpl.core.visualization.visualize_keypoints3d import visualize_kp3d
+from detrsmpl.data.data_structures.multi_human_data import MultiHumanData
+from detrsmpl.utils.ffmpeg_utils import video_to_images
+from mmcv.runner import get_dist_info
+from config.config import cfg
+import torch.distributed as dist
+import shutil
+
+class INFERENCE(torch.utils.data.Dataset):
+ def __init__(self, img_dir=None,out_path=None):
+
+ self.output_path = out_path
+
+ self.img_dir = img_dir
+
+ self.is_vid = False
+
+        # decide whether the input is an .mp4 video or a directory of images
+ rank, _ = get_dist_info()
+ if self.img_dir.endswith('.mp4'):
+ self.is_vid = True
+ img_name = self.img_dir.split('/')[-1][:-4]
+ # self.img_dir = self.img_dir[:-4]
+ else:
+ img_name = self.img_dir.split('/')[-1]
+ self.img_name = img_name+'_out'
+ self.output_path = os.path.join(self.output_path,self.img_name)
+ os.makedirs(self.output_path, exist_ok=True)
+ self.tmp_dir = os.path.join(self.output_path, 'temp_img')
+ os.makedirs(self.tmp_dir, exist_ok=True)
+        self.result_img_dir = os.path.join(self.output_path, 'res_img')
+        os.makedirs(self.result_img_dir, exist_ok=True)
+
+
+ if not self.is_vid:
+ if rank == 0:
+ image_files = sorted(glob(self.img_dir + '/*.jpg') + glob(self.img_dir + '/*.png'))
+ for i, image_file in enumerate(image_files):
+ new_name = os.path.join(self.tmp_dir, '%06d.png'%i)
+ shutil.copy(image_file, new_name)
+ dist.barrier()
+ else:
+ if rank == 0:
+ video_to_images(self.img_dir, self.tmp_dir)
+ dist.barrier()
+ self.img_paths = sorted(glob(self.tmp_dir+'/*',recursive=True))
+ self.score_threshold = 0.2
+        self.resolution = [720, 1280]  # AGORA test
+ # self.resolution = [1200, 1600] # EHF
+ # self.img_paths = sorted(glob(self.img_dir,recursive=True))
+ self.format = DefaultFormatBundle()
+ self.normalize = Normalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
+
+ def __len__(self):
+ return len(self.img_paths)
+
+ def __getitem__(self, idx):
+
+ img = load_img(self.img_paths[idx],'BGR')
+ img_whole_bbox = np.array([0, 0, img.shape[1],img.shape[0]])
+ img, img2bb_trans, bb2img_trans, _, _ = \
+ augmentation_keep_size(img, img_whole_bbox, 'test')
+
+ cropped_img_shape=img.shape[:2]
+ img = (img.astype(np.float32))
+
+ inputs = {'img': img}
+ targets = {
+ 'body_bbox_center': np.array(img_whole_bbox[None]),
+ 'body_bbox_size': np.array(img_whole_bbox[None])}
+ meta_info = {
+ 'ori_shape':np.array(self.resolution),
+ 'img_shape': np.array(img.shape[:2]),
+ 'img2bb_trans': img2bb_trans,
+ 'bb2img_trans': bb2img_trans,
+ 'ann_idx': idx}
+ result = {**inputs, **targets, **meta_info}
+
+ result = self.normalize(result)
+ result = self.format(result)
+
+ return result
+
+ def inference(self, outs):
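+        """Dump per-person SMPL-X parameters to .pkl files, render mesh overlays, and return detected body bboxes keyed by image name."""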
+ img_paths = self.img_paths
+ sample_num = len(outs)
+ output = {}
+
+ for out in outs:
+ ann_idx = out['image_idx']
+ img_cropped = mmcv.imdenormalize(
+ img=(out['img'].cpu().numpy()).transpose(1, 2, 0),
+ mean=np.array([123.675, 116.28, 103.53]),
+ std=np.array([58.395, 57.12, 57.375]),
+ to_bgr=True).astype(np.uint8)
+ # bb2img_trans = out['bb2img_trans']
+ # img2bb_trans = out['img2bb_trans']
+ scores = out['scores'].clone().cpu().numpy()
+ img_shape = out['img_shape'].cpu().numpy()[::-1] # w, h
+ width,height = img_shape
+ width += width % 2
+ height += height % 2
+ img_shape = np.array([width, height])
+ img = cv2.imread(img_paths[ann_idx]) # h, w
+
+
+ joint_proj = out['smplx_joint_proj'].clone().cpu().numpy()
+ joint_vis = out['smplx_joint_proj'].clone().cpu().numpy()
+ joint_coco = out['keypoints_coco'].clone().cpu().numpy()
+ joint_coco_raw = joint_coco.copy()
+ smpl_kp3d_coco, _ = convert_kps(out['smpl_kp3d'].clone().cpu().numpy(),src='smplx',dst='coco', approximate=True)
+
+
+
+ body_bbox = out['body_bbox'].clone().cpu().numpy()
+ lhand_bbox = out['lhand_bbox'].clone().cpu().numpy()
+ rhand_bbox = out['rhand_bbox'].clone().cpu().numpy()
+ face_bbox = out['face_bbox'].clone().cpu().numpy()
+
+ if self.resolution == [720, 1280]:
+ joint_proj[:, :, 0] = joint_proj[:, :, 0] / img_shape[0] * 3840
+ joint_proj[:, :, 1] = joint_proj[:, :, 1] / img_shape[1] * 2160
+ joint_vis[:, :, 0] = joint_vis[:, :, 0] / img_shape[0] * img.shape[1]
+ joint_vis[:, :, 1] = joint_vis[:, :, 1]/ img_shape[1] * img.shape[0]
+
+ joint_coco[:, :, 0] = joint_coco[:, :, 0] / img_shape[0] * img.shape[1]
+ joint_coco[:, :, 1] = joint_coco[:, :, 1]/ img_shape[1] * img.shape[0]
+ scale = np.array([
+ img.shape[1]/img_shape[0],
+ img.shape[1]/img_shape[0],
+ img.shape[1]/img_shape[0],
+ img.shape[1]/img_shape[0],
+ ])
+ body_bbox_raw = body_bbox.copy()
+ body_bbox = body_bbox * scale
+ lhand_bbox = lhand_bbox * scale
+ rhand_bbox = rhand_bbox * scale
+ face_bbox = face_bbox * scale
+ elif self.resolution == [1200, 1600]:
+
+ joint_proj[:, :, 0] = joint_proj[:, :, 0] * (1200 / 800)
+ joint_proj[:, :, 1] = joint_proj[:, :, 1] * (1600 / 1066)
+
+ joint_vis[:, :, 0] = joint_vis[:, :, 0] * (1200 / 800)
+ joint_vis[:, :, 1] = joint_vis[:, :, 1] * (1600 / 1066)
+
+ scale = np.array([1600/1066, 1200/800, 1600/1066, 1200/800])[None]
+ body_bbox = body_bbox * scale
+ lhand_bbox = lhand_bbox * scale
+ rhand_bbox = rhand_bbox * scale
+ face_bbox = face_bbox * scale
+
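+            # detections are assumed to be sorted by confidence, so stop at the first score below the threshold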
+ for i, score in enumerate(scores):
+ if score < self.score_threshold:
+ break
+
+ save_name = img_paths[ann_idx].split('/')[-1][:-4] # if not crop should be -4
+                if self.resolution == [2160, 3840]:
+ save_name = save_name.split('_ann_id')[0]
+ else:
+ save_name = save_name.split('_1280x720')[0]
+
+
+
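+                # collect SMPL-X parameters for this detection; eye poses are filled with zeros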
+ save_dict = {
+ 'params': {
+ 'transl': out['cam_trans'][i].reshape(1, -1).cpu().numpy(),
+ 'global_orient': out['smplx_root_pose'][i].reshape(1, -1).cpu().numpy(),
+ 'body_pose': out['smplx_body_pose'][i].reshape(1, -1).cpu().numpy(),
+ 'left_hand_pose': out['smplx_lhand_pose'][i].reshape(1, -1).cpu().numpy(),
+ 'right_hand_pose': out['smplx_rhand_pose'][i].reshape(1, -1).cpu().numpy(),
+ 'reye_pose': np.zeros((1, 3)),
+ 'leye_pose': np.zeros((1, 3)),
+ 'jaw_pose': out['smplx_jaw_pose'][i].reshape(1, -1).cpu().numpy(),
+ 'expression': out['smplx_expr'][i].reshape(1, -1).cpu().numpy(),
+ 'betas': out['smplx_shape'][i].reshape(1, -1).cpu().numpy()},
+
+ 'joints': joint_proj[i].reshape(1, -1, 2)[0,:24]}
+
+ # save
+ exist_result_path = glob(osp.join(self.output_path, 'predictions', save_name + '*'))
+ if len(exist_result_path) == 0:
+ person_idx = 0
+ else:
+ last_person_idx = max([
+ int(name.split('personId_')[1].split('.pkl')[0])
+ for name in exist_result_path
+ ])
+ person_idx = last_person_idx + 1
+
+ save_name += '_personId_' + str(person_idx) + '.pkl'
+ os.makedirs(osp.join(self.output_path, 'predictions'), exist_ok=True)
+ with open(osp.join(self.output_path, 'predictions', save_name),'wb') as f:
+ pickle.dump(save_dict, f)
+ # mesh
+ # bbox
+
+
+ if i == 0:
+ save_name = img_paths[ann_idx].split('/')[-1][:-4]
+ cv2.imwrite(os.path.join(self.result_img_dir,img_paths[ann_idx].split('/')[-1]), img)
+ else:
+ # dump bbox
+ body_xywh = xyxy2xywh(body_bbox[:i])
+ score = scores[:i]
+ out_value = [{'bbox': b, 'score': s} for b, s in zip(body_xywh, score)]
+ out_key = img_paths[ann_idx].split('/')[-1]
+ output.update({out_key: out_value})
+
+ # show bbox
+ img = mmcv.imshow_bboxes(img, body_bbox[:i], show=False, colors='green')
+ img = mmcv.imshow_bboxes(img, lhand_bbox[:i], show=False, colors='blue')
+ img = mmcv.imshow_bboxes(img, rhand_bbox[:i], show=False, colors='yellow')
+ img = mmcv.imshow_bboxes(img, face_bbox[:i], show=False, colors='red')
+
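+                # render all accepted detections on the frame with a fixed 5000 px focal length and the image centre as principal point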
+ verts = out['smpl_verts'][:i] + out['cam_trans'][:i][:, None]
+ body_model_cfg = dict(
+ type='smplx',
+ keypoint_src='smplx',
+ num_expression_coeffs=10,
+ num_betas=10,
+ gender='neutral',
+ keypoint_dst='smplx_137',
+ model_path='data/body_models/smplx',
+ use_pca=False,
+ use_face_contour=True)
+ body_model = build_body_model(body_model_cfg).to('cuda')
+ # for n, v in enumerate(verts):
+ # save_obj(
+ # osp.join(self.out_path, 'vis', img_paths[ann_idx].split('/')[-1].rjust(5+4,'0')).replace('.jpg',f'_{n}_.obj'),
+ # verts = v,
+ # faces=torch.tensor(body_model.faces.astype(np.int32))
+ # )
+ # print(osp.join(self.out_path, 'vis', img_paths[ann_idx].split('/')[-1]))
+
+ render_smpl(
+ verts=verts[None],
+ body_model=body_model,
+ # K= np.array(
+ # [[img_shape[0]/2, 0, img_shape[0]/2],
+ # [0, img_shape[0]/2, img_shape[1]/2],
+ # [0, 0, 1]]),
+ K= np.array(
+ [[5000, 0, img_shape[0]/2],
+ [0, 5000, img_shape[1]/2],
+ [0, 0, 1]]),
+ R=None,
+ T=None,
+ # output_path=osp.join(self.out_path, 'vis', img_paths[ann_idx].split('/')[-1].rjust(5+4,'0')),
+ output_path=os.path.join(self.result_img_dir,img_paths[ann_idx].split('/')[-1]),
+                image_array=cv2.resize(img, (img_shape[0], img_shape[1]), interpolation=cv2.INTER_CUBIC),
+ in_ndc=False,
+ alpha=0.9,
+ convention='opencv',
+ projection='perspective',
+ overwrite=True,
+ no_grad=True,
+ device='cuda',
+                resolution=[img_shape[1], img_shape[0]],
+ render_choice='hq',
+ )
+ return output
+
diff --git a/datasets/INFERENCE_demo.py b/datasets/INFERENCE_demo.py
new file mode 100644
index 0000000000000000000000000000000000000000..39ba4bce29b86d481c49600209b21274393eba17
--- /dev/null
+++ b/datasets/INFERENCE_demo.py
@@ -0,0 +1,169 @@
+import os
+import os.path as osp
+from glob import glob
+import numpy as np
+from config.config import cfg
+import copy
+import json
+import pickle
+import cv2
+import torch
+from pycocotools.coco import COCO
+from util.human_models import smpl_x
+from util.preprocessing import load_img, sanitize_bbox, process_bbox,augmentation_keep_size, load_ply, load_obj
+from util.transforms import rigid_align, rigid_align_batch
+import tqdm
+import random
+from util.formatting import DefaultFormatBundle
+from detrsmpl.data.datasets.pipelines.transforms import Normalize
+from humandata import HumanDataset
+from detrsmpl.utils.demo_utils import xywh2xyxy, xyxy2xywh, box2cs
+from detrsmpl.core.conventions.keypoints_mapping import convert_kps
+import mmcv
+import cv2
+import numpy as np
+from detrsmpl.core.visualization.visualize_keypoints2d import visualize_kp2d
+from detrsmpl.core.visualization.visualize_smpl import visualize_smpl_hmr,render_smpl
+from detrsmpl.models.body_models.builder import build_body_model
+from detrsmpl.core.visualization.visualize_keypoints3d import visualize_kp3d
+from detrsmpl.data.data_structures.multi_human_data import MultiHumanData
+from detrsmpl.utils.ffmpeg_utils import video_to_images
+from mmcv.runner import get_dist_info
+from config.config import cfg
+import torch.distributed as dist
+import shutil
+import re
+from pytorch3d.io import save_obj
+
+class INFERENCE_demo(torch.utils.data.Dataset):
+ def __init__(self, img_dir=None,out_path=None):
+
+ self.output_path = out_path
+ self.mesh_path = os.path.join(self.output_path, 'mesh')
+ self.img_dir = img_dir
+ self.is_vid = True
+ body_model_cfg = dict(
+ type='smplx',
+ keypoint_src='smplx',
+ num_expression_coeffs=10,
+ num_betas=10,
+ gender='neutral',
+ keypoint_dst='smplx_137',
+ model_path='data/body_models/smplx',
+ use_pca=False,
+ use_face_contour=True)
+ self.body_model = build_body_model(body_model_cfg).to('cuda')
+
+ os.makedirs(self.output_path, exist_ok=True)
+ self.tmp_dir = os.path.join(self.output_path, 'temp_img')
+ os.makedirs(self.tmp_dir, exist_ok=True)
+        self.result_img_dir = os.path.join(self.output_path, 'res_img')
+        os.makedirs(self.result_img_dir, exist_ok=True)
+ video_to_images(self.img_dir, self.tmp_dir)
+ self.img_paths = sorted(glob(self.tmp_dir+'/*',recursive=True))
+
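+        # num_person caps how many detections are kept per frame; the 0.1 fallback effectively keeps only the first detection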
+ self.num_person = cfg.num_person if 'num_person' in cfg else 0.1
+ self.score_threshold = cfg.threshold if 'threshold' in cfg else 0.1
+ self.format = DefaultFormatBundle()
+ self.normalize = Normalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
+
+ def __len__(self):
+ return len(self.img_paths)
+
+ def __getitem__(self, idx):
+ img = load_img(self.img_paths[idx],'BGR')
+ self.resolution = img.shape[:2]
+ img_whole_bbox = np.array([0, 0, img.shape[1],img.shape[0]])
+ img, img2bb_trans, bb2img_trans, _, _ = \
+ augmentation_keep_size(img, img_whole_bbox, 'test')
+
+ # cropped_img_shape=img.shape[:2]
+ img = (img.astype(np.float32))
+
+ inputs = {'img': img}
+ targets = {
+ 'body_bbox_center': np.array(img_whole_bbox[None]),
+ 'body_bbox_size': np.array(img_whole_bbox[None])}
+ meta_info = {
+ 'ori_shape':np.array(self.resolution),
+ 'img_shape': np.array(img.shape[:2]),
+ 'img2bb_trans': img2bb_trans,
+ 'bb2img_trans': bb2img_trans,
+ 'ann_idx': idx}
+ result = {**inputs, **targets, **meta_info}
+
+ result = self.normalize(result)
+ result = self.format(result)
+
+ return result
+
+ def inference(self, outs):
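+        """Save per-person meshes as .obj files and render SMPL-X overlays for every video frame."""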
+ img_paths = self.img_paths
+ for out in outs:
+ ann_idx = out['image_idx']
+ # img_cropped = mmcv.imdenormalize(
+ # img=(out['img'].cpu().numpy()).transpose(1, 2, 0),
+ # mean=np.array([123.675, 116.28, 103.53]),
+ # std=np.array([58.395, 57.12, 57.375]),
+ # to_bgr=True).astype(np.uint8)
+ # bb2img_trans = out['bb2img_trans']
+ # img2bb_trans = out['img2bb_trans']
+ scores = out['scores'].clone().cpu().numpy()
+ img_shape = out['img_shape'].cpu().numpy()[::-1] # w, h
+ img = cv2.imread(img_paths[ann_idx]) # h, w
+ scale = img.shape[1]/img_shape[0]
+ body_bbox = out['body_bbox'].clone().cpu().numpy()
+ body_bbox = body_bbox * scale
+ joint_3d, _ = convert_kps(out['smpl_kp3d'].clone().cpu().numpy(),src='smplx',dst='smplx', approximate=True)
+
+ for i, score in enumerate(scores):
+ if score < self.score_threshold:
+ break
+ if i>self.num_person:
+ break
+ save_name = img_paths[ann_idx].split('/')[-1]
+ save_name = save_name.split('.')[0]
+ vert = out['smpl_verts'][i] + out['cam_trans'][i][None]
+ # save mesh
+ exist_result_path = glob(osp.join(self.mesh_path, save_name + '*'))
+ if len(exist_result_path) == 0:
+ person_idx = 0
+ else:
+ last_person_idx = max([
+ int(name.split('personId_')[1].split('.obj')[0])
+ for name in exist_result_path
+ ])
+ person_idx = last_person_idx + 1
+
+ save_name += '_personId_' + str(person_idx) + '.obj'
+ os.makedirs(self.mesh_path, exist_ok=True)
+ save_obj(osp.join(self.mesh_path, save_name), vert, faces=torch.tensor(self.body_model.faces.astype(np.int32)))
+
+ if i == 0:
+ save_name = img_paths[ann_idx].split('/')[-1][:-4]
+ cv2.imwrite(os.path.join(self.result_img_dir,img_paths[ann_idx].split('/')[-1]), img)
+ else:
+ verts = out['smpl_verts'][:i] + out['cam_trans'][:i][:, None]
+ img = mmcv.imshow_bboxes(img, body_bbox[:i], show=False, colors='green')
+ render_smpl(
+ verts=verts[None],
+ body_model=self.body_model,
+ K= np.array(
+ [[5000, 0, img_shape[0]/2],
+ [0, 5000, img_shape[1]/2],
+ [0, 0, 1]]),
+ R=None,
+ T=None,
+ output_path=os.path.join(self.result_img_dir,img_paths[ann_idx].split('/')[-1]),
+                image_array=cv2.resize(img, (img_shape[0], img_shape[1]), interpolation=cv2.INTER_CUBIC),
+ in_ndc=False,
+ alpha=0.9,
+ convention='opencv',
+ projection='perspective',
+ overwrite=True,
+ no_grad=True,
+ device='cuda',
+ resolution=[img_shape[1],img_shape[0]],
+ render_choice='hq'
+ )
+ return None
+
diff --git a/datasets/SynBody.py b/datasets/SynBody.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f9428069440cd2723aac31e3e2c9e9f43ec7831
--- /dev/null
+++ b/datasets/SynBody.py
@@ -0,0 +1,53 @@
+import os
+import os.path as osp
+import numpy as np
+import torch
+import cv2
+import json
+import copy
+from pycocotools.coco import COCO
+from config.config import cfg
+from util.human_models import smpl_x
+from util.preprocessing import (
+ load_img, process_bbox, augmentation_instance_sample
+ ,process_human_model_output_batch_simplify,process_db_coord_batch_no_valid)
+from util.transforms import world2cam, cam2pixel, rigid_align
+from humandata import HumanDataset
+
+
+class SynBody(HumanDataset):
+ def __init__(self, transform, data_split):
+ super(SynBody, self).__init__(transform, data_split)
+ self.img_dir = 'data/datasets/synbody'
+ self.annot_path = 'data/preprocessed_npz/multihuman_data/synbody_v1.1_multi_new.npz'
+ self.annot_path_cache = 'data/preprocessed_npz/cache/synbody_v1.1_cache_new_10.npz'
+ self.use_cache = getattr(cfg, 'use_cache', False)
+ self.img_shape = (720, 1280) # (h, w)
+ self.cam_param = {
+ 'focal': (540, 540), # (fx, fy)
+ 'princpt': (640, 360) # (cx, cy)
+ }
+
+ # check image shape
+ img_path = osp.join(self.img_dir,
+ np.load(self.annot_path)['image_path'][0])
+
+ img_shape = cv2.imread(img_path).shape[:2]
+ assert self.img_shape == img_shape, 'image shape is incorrect: {} vs {}'.format(
+ self.img_shape, img_shape)
+
+ # load data or cache
+ if self.use_cache and osp.isfile(self.annot_path_cache):
+ print(
+ f'[{self.__class__.__name__}] loading cache from {self.annot_path_cache}'
+ )
+ self.datalist = self.load_cache(self.annot_path_cache)
+ else:
+ if self.use_cache:
+ print(
+ f'[{self.__class__.__name__}] Cache not found, generating cache...'
+ )
+ self.datalist = self.load_data(train_sample_interval=getattr(
+ cfg, f'{self.__class__.__name__}_train_sample_interval', 15))
+ if self.use_cache:
+ self.save_cache(self.annot_path_cache, self.datalist)
diff --git a/datasets/UBody_MM.py b/datasets/UBody_MM.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1aa88b2542a1e06ebc8d51100940cf7e5aa1668
--- /dev/null
+++ b/datasets/UBody_MM.py
@@ -0,0 +1,1122 @@
+import os
+import os.path as osp
+from glob import glob
+import numpy as np
+from config.config import cfg
+import copy
+
+import cv2
+import torch
+from pycocotools.coco import COCO
+from util.human_models import smpl_x
+from util.preprocessing import load_img, process_bbox
+from util.transforms import rigid_align_batch
+import tqdm
+from detrsmpl.utils.geometry import batch_rodrigues, project_points_new
+import random
+from util.formatting import DefaultFormatBundle
+from detrsmpl.data.datasets.pipelines.transforms import Normalize
+from datasets.humandata import HumanDataset
+import time
+from util.preprocessing import (
+ load_img, process_bbox, augmentation_instance_sample,process_human_model_output_batch_simplify,process_db_coord_batch_no_valid,process_human_model_output_batch_ubody)
+KPS2D_KEYS = [
+ 'keypoints2d_ori', 'keypoints2d_smplx', 'keypoints2d_smpl',
+ 'keypoints2d_original','keypoints2d_gta'
+]
+KPS3D_KEYS = [
+ 'keypoints3d_cam', 'keypoints3d', 'keypoints3d_smplx', 'keypoints3d_smpl',
+ 'keypoints3d_original', 'keypoints3d_gta'
+]
+class UBody_MM(HumanDataset):
+ def __init__(self, transform, data_split):
+ super(UBody_MM, self).__init__(transform, data_split)
+
+ self.img_dir = 'data/osx_data/UBody'
+ self.data_split = data_split
+ self.test_vid_list = np.load('data/osx_data/UBody/splits/intra_scene_test_list.npy')
+ if self.data_split == 'train':
+ # self.annot_path = 'data/preprocessed_npz/multihuman_data/ubody_intra_train_multi_all.npz'
+ # self.annot_path_cache = 'data/preprocessed_npz/cache/ubody_intra_train_cache_fix8.npz'
+ self.annot_path = 'data/preprocessed_npz/multihuman_data/ubody_train_intra_multi.npz'
+ self.annot_path_cache = 'data/preprocessed_npz/cache/ubody_train_intra_cache_080824.npz'
+ self.sample_interval = getattr(
+ cfg, f'{self.__class__.__name__}_train_sample_interval', 5)
+ elif self.data_split == 'test':
+ self.annot_path = 'data/preprocessed_npz/ubody_intra_test_all.npz'
+ self.annot_path_cache = 'data/preprocessed_npz/cache/ubody_intra_test_multi_all_smpler_x.npz'
+ self.sample_interval = getattr(
+ cfg, f'{self.__class__.__name__}_test_sample_interval', 100)
+ # self.test_set = 'val'
+ self.use_cache = getattr(cfg, 'use_cache', False)
+        self.img_shape = None  # (1024, 1024) # (h, w)
+ self.cam_param = {}
+ self.keypoints2d = 'keypoints2d_ubody'
+ # load data
+ if self.use_cache and osp.isfile(self.annot_path_cache):
+ print(
+ f'[{self.__class__.__name__}] loading cache from {self.annot_path_cache}'
+ )
+ self.datalist = self.load_cache(self.annot_path_cache)
+ else:
+ if self.use_cache:
+ print(
+ f'[{self.__class__.__name__}] Cache not found, generating cache...'
+ )
+ self.datalist = self.load_data(train_sample_interval=self.sample_interval)
+
+ if self.use_cache:
+ self.save_cache(self.annot_path_cache, self.datalist)
+
+
+ def evaluate(self, outs, cur_sample_idx):
+ annots = self.datalist
+ sample_num = len(outs)
+ eval_result = {
+ 'pa_mpvpe_all': [],
+ 'pa_mpvpe_l_hand': [],
+ 'pa_mpvpe_r_hand': [],
+ 'pa_mpvpe_hand': [],
+ 'pa_mpvpe_face': [],
+ 'mpvpe_all': [],
+ 'mpvpe_l_hand': [],
+ 'mpvpe_r_hand': [],
+ 'mpvpe_hand': [],
+ 'mpvpe_face': []
+ }
+
+ vis = getattr(cfg, 'vis', False)
+ vis_save_dir = cfg.vis_dir
+
+ for n in range(sample_num):
+
+ out = outs[n]
+ mesh_gt = out['smplx_mesh_cam_target']
+ mesh_out = out['smplx_mesh_cam']
+ cam_trans = out['cam_trans']
+ joint_proj = out['smplx_joint_proj']
+ img_wh = (out['img_shape'])
+ ann_idx = out['gt_ann_idx']
+ img_path = []
+ for ann_id in ann_idx:
+ img_path.append(annots[ann_id]['img_path'])
+ # print(img_path)
+ eval_result['img_path'] = img_path
+ eval_result['ann_idx'] = ann_idx
+
+            # project GT joints and vertices into the image plane to build visibility masks
+ joint_gt_body_wo_trans = np.dot(smpl_x.j14_regressor,
+ mesh_gt).transpose(1,0,2)
+ joint_gt_body_proj = project_points_new(
+ points_3d=torch.Tensor(joint_gt_body_wo_trans),
+ pred_cam=torch.Tensor(cam_trans),
+ focal_length=5000,
+ camera_center=torch.Tensor(img_wh/2)
+ ) # origin image space
+
+
+
+ joint_gt_lhand_wo_trans = np.dot(
+ smpl_x.orig_hand_regressor['left'], mesh_gt).transpose(1,0,2)
+ joint_gt_lhand_proj = project_points_new(
+ points_3d=torch.Tensor(joint_gt_lhand_wo_trans),
+ pred_cam=torch.Tensor(cam_trans),
+ focal_length=5000,
+ camera_center=torch.Tensor(img_wh/2)
+ ) # origin image space
+ joint_gt_rhand_wo_trans = np.dot(
+                smpl_x.orig_hand_regressor['right'], mesh_gt).transpose(1,0,2)
+ joint_gt_rhand_proj = project_points_new(
+ points_3d=torch.Tensor(joint_gt_rhand_wo_trans),
+ pred_cam=torch.Tensor(cam_trans),
+ focal_length=5000,
+ camera_center=torch.Tensor(img_wh/2)
+ ) # origin image space
+ mesh_gt_proj = project_points_new(
+ points_3d=torch.Tensor(mesh_gt),
+ pred_cam=torch.Tensor(cam_trans),
+ focal_length=5000,
+ camera_center=torch.Tensor(img_wh/2))
+
+
+ joint_gt_body_valid = self.validate_within_img_batch(
+ img_wh, joint_gt_body_proj)
+ joint_gt_lhand_valid = self.validate_within_img_batch(
+ img_wh, joint_gt_lhand_proj)
+ joint_gt_rhand_valid = self.validate_within_img_batch(
+ img_wh, joint_gt_rhand_proj)
+ mesh_valid = self.validate_within_img_batch(img_wh, mesh_gt_proj)
+ mesh_valid = mesh_valid.cpu().numpy()>0
+ mesh_lhand_valid = mesh_valid[:,smpl_x.hand_vertex_idx['left_hand']]
+ mesh_rhand_valid = mesh_valid[:,smpl_x.hand_vertex_idx['right_hand']]
+ mesh_face_valid = mesh_valid[:,smpl_x.face_vertex_idx]
+
+ # MPVPE from all vertices
+ mesh_out = out['smplx_mesh_cam']
+ mesh_out_align = rigid_align_batch(mesh_out, mesh_gt)
+
+ if mesh_valid.sum()>0:
+ pa_mpvpe_all = np.sqrt(np.sum(
+ (mesh_out_align - mesh_gt)**2, -1))[mesh_valid].mean() * 1000
+ else:
+ pa_mpvpe_all = 0
+
+ eval_result['pa_mpvpe_all'].append(pa_mpvpe_all)
+
+ mesh_out_align = mesh_out - np.dot(smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:,smpl_x.J_regressor_idx['pelvis'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:,smpl_x.J_regressor_idx['pelvis'], None, :]
+ if mesh_valid.sum()>0:
+ mpvpe_all = np.sqrt(np.sum(
+ (mesh_out_align - mesh_gt)**2, -1))[mesh_valid].mean() * 1000
+ else:
+ mpvpe_all = 0
+ eval_result['mpvpe_all'].append(mpvpe_all)
+ vis = False
+
+ if vis:
+ import mmcv
+ img = (out['img']).transpose(0,2,3,1)
+
+ img = mmcv.imdenormalize(
+ img=img[0],
+ mean=np.array([123.675, 116.28, 103.53]),
+ std=np.array([58.395, 57.12, 57.375]),
+ to_bgr=True).astype(np.uint8)
+ cv2.imwrite('temp.png',img)
+ from detrsmpl.core.visualization.visualize_keypoints2d import visualize_kp2d
+
+ # out['smplx_joint_proj']
+ from pytorch3d.io import save_obj
+
+ mesh_pred_proj = project_points_new(
+ points_3d=torch.Tensor(mesh_gt),
+ pred_cam=torch.Tensor(cam_trans),
+ focal_length=5000,
+ camera_center=torch.Tensor(img_wh/2))
+ mesh_pred_proj = (mesh_valid[:,:,None])*mesh_pred_proj.detach().cpu().numpy()
+ visualize_kp2d(
+ mesh_pred_proj[0][None],
+ image_array=img[None].copy(),
+ disable_limbs=True,
+ overwrite=True,
+ output_path='./figs/gt2d/%d'%ann_idx
+ )
+ mesh_pred_proj = project_points_new(
+ points_3d=torch.Tensor(mesh_out),
+ pred_cam=torch.Tensor(cam_trans),
+ focal_length=5000,
+ camera_center=torch.Tensor(img_wh/2))
+ mesh_pred_proj = (mesh_valid[:,:,None])*mesh_pred_proj.detach().cpu().numpy()
+ visualize_kp2d(
+ mesh_pred_proj[0][None],
+ image_array=img[None].copy(),
+ disable_limbs=True,
+ overwrite=True,
+ output_path='./figs/pred2d/%d'%ann_idx
+ )
+ save_obj('./figs/pred_smpl_%d.obj'%mpvpe_all,verts = torch.tensor(mesh_out_align[0]),faces=torch.tensor([]))
+ save_obj('./figs/gt_smpl_%d.obj'%mpvpe_all,verts = torch.tensor(mesh_gt[0]),faces=torch.tensor([]))
+ # MPVPE from hand vertices
+ mesh_gt_lhand = mesh_gt[:, smpl_x.hand_vertex_idx['left_hand'], :]
+ mesh_out_lhand = mesh_out[:, smpl_x.hand_vertex_idx['left_hand'], :]
+ mesh_gt_rhand = mesh_gt[:, smpl_x.hand_vertex_idx['right_hand'], :]
+ mesh_out_rhand = mesh_out[:, smpl_x.hand_vertex_idx['right_hand'], :]
+ mesh_out_lhand_align = \
+ mesh_out_lhand - \
+ np.dot(smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['lwrist'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['lwrist'], None, :]
+
+ mesh_out_rhand_align = \
+ mesh_out_rhand - \
+ np.dot(smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['rwrist'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['rwrist'], None, :]
+ mpvpe_hand = []
+
+ if mesh_lhand_valid.sum() != 0:
+ mpvpe_lhand = np.sqrt(
+ np.sum((mesh_out_lhand_align - mesh_gt_lhand)**2,
+ -1))[mesh_lhand_valid].mean() * 1000
+ mpvpe_hand.append(mpvpe_lhand)
+ eval_result['mpvpe_l_hand'].append(mpvpe_lhand)
+ else:
+ eval_result['mpvpe_l_hand'].append(np.zeros_like(mpvpe_all))
+ if mesh_rhand_valid.sum() != 0:
+ mpvpe_rhand = np.sqrt(
+ np.sum((mesh_out_rhand_align - mesh_gt_rhand)**2,
+ -1))[mesh_rhand_valid].mean() * 1000
+ mpvpe_hand.append(mpvpe_rhand)
+ eval_result['mpvpe_r_hand'].append(mpvpe_rhand)
+ else:
+ eval_result['mpvpe_r_hand'].append(np.zeros_like(mpvpe_all))
+ if len(mpvpe_hand) > 0:
+ mpvpe_hand = np.stack(mpvpe_hand,axis=-1)
+ eval_result['mpvpe_hand'].append(np.mean(mpvpe_hand,axis=-1))
+ else:
+ eval_result['mpvpe_hand'].append(np.zeros_like(mpvpe_all))
+ mesh_out_lhand_align = rigid_align_batch(mesh_out_lhand, mesh_gt_lhand)
+ mesh_out_rhand_align = rigid_align_batch(mesh_out_rhand, mesh_gt_rhand)
+ pa_mpvpe_hand = []
+ if mesh_lhand_valid.sum() != 0:
+ pa_mpvpe_lhand = np.sqrt(
+ np.sum((mesh_out_lhand_align - mesh_gt_lhand)**2,
+ -1))[mesh_lhand_valid].mean() * 1000
+ pa_mpvpe_hand.append(pa_mpvpe_lhand)
+ eval_result['pa_mpvpe_l_hand'].append(pa_mpvpe_lhand)
+ else:
+ eval_result['pa_mpvpe_l_hand'].append(np.zeros_like(mpvpe_all))
+ if mesh_rhand_valid.sum() != 0:
+ # pa_mpvpe_rhand = np.sqrt(np.sum((mesh_out_rhand_align - mesh_gt_rhand)**2, -1)).sum(-1) * 1000 / (mesh_rhand_valid.sum(-1)+1e-6)
+ pa_mpvpe_rhand = np.sqrt(
+ np.sum((mesh_out_rhand_align - mesh_gt_rhand)**2,
+ -1))[mesh_rhand_valid].mean() * 1000
+ pa_mpvpe_hand.append(pa_mpvpe_rhand)
+ eval_result['pa_mpvpe_r_hand'].append(pa_mpvpe_rhand)
+ else:
+ eval_result['pa_mpvpe_r_hand'].append(np.zeros_like(mpvpe_all))
+ if len(pa_mpvpe_hand) > 0:
+ pa_mpvpe_hand = np.stack(pa_mpvpe_hand,axis=-1)
+ eval_result['pa_mpvpe_hand'].append(np.mean(pa_mpvpe_hand,axis=-1))
+ else:
+                eval_result['pa_mpvpe_hand'].append(np.zeros_like(mpvpe_all))
+
+ # MPVPE from face vertices
+ mesh_gt_face = mesh_gt[:, smpl_x.face_vertex_idx, :]
+ mesh_out_face = mesh_out[:, smpl_x.face_vertex_idx, :]
+ mesh_out_face_align = \
+ mesh_out_face - \
+ np.dot(smpl_x.J_regressor, mesh_out).transpose(1,0,2)[:, smpl_x.J_regressor_idx['neck'], None, :] + \
+ np.dot(smpl_x.J_regressor, mesh_gt).transpose(1,0,2)[:, smpl_x.J_regressor_idx['neck'], None, :]
+ if mesh_face_valid.sum() != 0:
+ eval_result['mpvpe_face'].append(
+ np.sqrt(np.sum((mesh_out_face_align - mesh_gt_face)**2,
+ -1))[mesh_face_valid].mean() * 1000)
+ else:
+                eval_result['mpvpe_face'].append(np.zeros_like(mpvpe_all))
+ mesh_out_face_align = rigid_align_batch(mesh_out_face, mesh_gt_face)
+
+ if mesh_face_valid.sum() != 0:
+ eval_result['pa_mpvpe_face'].append(
+ np.sqrt(np.sum((mesh_out_face_align - mesh_gt_face)**2,
+ -1))[mesh_face_valid].mean() * 1000)
+ else:
+                eval_result['pa_mpvpe_face'].append(np.zeros_like(mpvpe_all))
+ for k,v in eval_result.items():
+ if k != 'img_path' and k != 'ann_idx':
+
+ if len(v)>1:
+ eval_result[k] = np.concatenate(v,axis=0)
+ else:
+ eval_result[k] = np.array(v)
+ return eval_result
+
+ def load_data(self, train_sample_interval=1):
+
+ content = np.load(self.annot_path, allow_pickle=True)
+ try:
+ frame_range = content['frame_range']
+ except KeyError:
+ self.num_data = len(content['image_path'])
+ frame_range = \
+ np.array([[i, i + 1] for i in range(self.num_data)])
+
+ num_examples = len(frame_range)
+
+ if 'meta' in content:
+ meta = content['meta'].item()
+ print('meta keys:', meta.keys())
+ else:
+ meta = None
+ print(
+ 'No meta info provided! Please give height and width manually')
+
+ print(
+ f'Start loading humandata {self.annot_path} into memory...\nDataset includes: {content.files}'
+ )
+ tic = time.time()
+ image_path = content['image_path']
+
+ if meta is not None and 'height' in meta:
+ height = np.array(meta['height'])
+ width = np.array(meta['width'])
+ image_shape = np.stack([height, width], axis=-1)
+ else:
+ image_shape = None
+
+ if meta is not None and 'gender' in meta and len(meta['gender']) != 0:
+ gender = meta['gender']
+ else:
+ gender = None
+
+ face_valid = meta['face_valid']
+ lhand_valid = meta['lefthand_valid']
+ rhand_valid = meta['righthand_valid']
+ valid_label = meta['valid_label']
+ is_crowd = meta['iscrowd']
+ keypoints_valid = content['keypoints2d_ubody'][:,:,2].sum(-1)!=0
+ bbox_xywh = content['bbox_xywh']
+ if 'smplx' in content:
+ smplx = content['smplx'].item()
+ as_smplx = 'smplx'
+ elif 'smpl' in content:
+ smplx = content['smpl'].item()
+ as_smplx = 'smpl'
+ elif 'smplh' in content:
+ smplx = content['smplh'].item()
+ as_smplx = 'smplh'
+ # TODO: temp solution, should be more general. But SHAPY is very special
+ elif self.__class__.__name__ == 'SHAPY':
+ smplx = {}
+ else:
+            raise KeyError('No SMPL-X/SMPL/SMPL-H parameters found, please check keys:\n'
+ f'{content.files}')
+
+ print('Smplx param', smplx.keys())
+
+ if 'lhand_bbox_xywh' in content and 'rhand_bbox_xywh' in content:
+ lhand_bbox_xywh = content['lhand_bbox_xywh']
+ rhand_bbox_xywh = content['rhand_bbox_xywh']
+ else:
+ lhand_bbox_xywh = np.zeros_like(bbox_xywh)
+ rhand_bbox_xywh = np.zeros_like(bbox_xywh)
+
+ if 'face_bbox_xywh' in content:
+ face_bbox_xywh = content['face_bbox_xywh']
+ else:
+ face_bbox_xywh = np.zeros_like(bbox_xywh)
+
+ decompressed = False
+ if content['__keypoints_compressed__']:
+ decompressed_kps = self.decompress_keypoints(content)
+ decompressed = True
+
+ keypoints3d = None
+ valid_kps3d = False
+ keypoints3d_mask = None
+ valid_kps3d_mask = False
+ for kps3d_key in KPS3D_KEYS:
+ if kps3d_key in content:
+ keypoints3d = decompressed_kps[kps3d_key][:, self.SMPLX_137_MAPPING, :] if decompressed \
+ else content[kps3d_key][:, self.SMPLX_137_MAPPING, :]
+ valid_kps3d = True
+ if keypoints3d.shape[-1] == 4:
+ valid_kps3d_mask = True
+ break
+
+ if self.keypoints2d is not None:
+ keypoints2d = decompressed_kps[self.keypoints2d][:, self.SMPLX_137_MAPPING, :] if decompressed \
+ else content[self.keypoints2d][:, self.SMPLX_137_MAPPING, :]
+ keypoints2d = keypoints2d[:,:,:3]
+ if keypoints2d.shape[-1] == 3:
+ valid_kps3d_mask = True
+
+
+ print('Done. Time: {:.2f}s'.format(time.time() - tic))
+
+ datalist = []
+
+        # process each frame and keep only samples with valid bboxes
+ for i in tqdm.tqdm(range(int(num_examples))):
+ if self.data_split == 'train' and i % self.sample_interval != 0:
+ continue
+
+ frame_start, frame_end = frame_range[i]
+ img_path = osp.join(self.img_dir, image_path[frame_start])
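+            # skip any frame whose video id appears in the intra-scene test list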
+ vid_name = img_path.split('/')[-2]
+ if 'Trim' in vid_name:
+ vid_name = vid_name.split('_Trim')[0]
+ if str(vid_name) in self.test_vid_list:
+ continue
+ # im_shape = cv2.imread(img_path).shape[:2]
+ img_shape = image_shape[
+ frame_start] if image_shape is not None else self.img_shape
+
+ bbox_list = bbox_xywh[frame_start:frame_end, :4]
+
+ unique_bbox_idx = np.unique(bbox_list,axis=0,return_index=True)[1]
+ unique_bbox_idx.sort()
+ unique_bbox_list = bbox_list[unique_bbox_idx]
+
+ valid_idx = []
+ body_bbox_list = []
+
+ if hasattr(cfg, 'bbox_ratio'):
+ bbox_ratio = cfg.bbox_ratio * 0.833 # preprocess body bbox is giving 1.2 box padding
+ else:
+ bbox_ratio = 1.25
+
+ for bbox_i, bbox in zip(unique_bbox_idx,unique_bbox_list):
+
+ bbox = process_bbox(bbox,
+ img_width=img_shape[1],
+ img_height=img_shape[0],
+ ratio=bbox_ratio)
+ if bbox is None:
+ continue
+
+                if is_crowd[frame_start + bbox_i] == 0 and \
+                        valid_label[frame_start + bbox_i] != 0 and \
+                        keypoints_valid[frame_start + bbox_i]:
+
+ valid_idx.append(frame_start + bbox_i)
+ bbox[2:] += bbox[:2]
+ body_bbox_list.append(bbox)
+ if len(valid_idx) == 0:
+ continue
+ valid_num = len(valid_idx)
+ # hand/face bbox
+ lhand_bbox_list = []
+ rhand_bbox_list = []
+ face_bbox_list = []
+
+ for bbox_i in valid_idx:
+ lhand_bbox = lhand_bbox_xywh[bbox_i]
+ rhand_bbox = rhand_bbox_xywh[bbox_i]
+ face_bbox = face_bbox_xywh[bbox_i]
+ if lhand_valid[bbox_i] > 0: # conf > 0
+ lhand_bbox = lhand_bbox[:4]
+ if hasattr(cfg, 'bbox_ratio'):
+ lhand_bbox = process_bbox(lhand_bbox,
+ img_width=img_shape[1],
+ img_height=img_shape[0],
+ ratio=cfg.bbox_ratio)
+ if lhand_bbox is not None:
+ lhand_bbox[2:] += lhand_bbox[:2] # xywh -> xyxy
+ else:
+ lhand_bbox = None
+ if rhand_valid[bbox_i] > 0:
+ rhand_bbox = rhand_bbox[:4]
+ if hasattr(cfg, 'bbox_ratio'):
+ rhand_bbox = process_bbox(rhand_bbox,
+ img_width=img_shape[1],
+ img_height=img_shape[0],
+ ratio=cfg.bbox_ratio)
+ if rhand_bbox is not None:
+ rhand_bbox[2:] += rhand_bbox[:2] # xywh -> xyxy
+ else:
+ rhand_bbox = None
+ if face_valid[bbox_i] > 0:
+ face_bbox = face_bbox[:4]
+ if hasattr(cfg, 'bbox_ratio'):
+ face_bbox = process_bbox(face_bbox,
+ img_width=img_shape[1],
+ img_height=img_shape[0],
+ ratio=cfg.bbox_ratio)
+ if face_bbox is not None:
+ face_bbox[2:] += face_bbox[:2] # xywh -> xyxy
+ else:
+ face_bbox = None
+ lhand_bbox_list.append(lhand_bbox)
+ rhand_bbox_list.append(rhand_bbox)
+ face_bbox_list.append(face_bbox)
+
+ # lhand_bbox = np.stack(lhand_bbox_list,axis=0)
+ # rhand_bbox = np.stack(rhand_bbox_list,axis=0)
+ # face_bbox = np.stack(face_bbox_list,axis=0)
+ joint_img = keypoints2d[valid_idx]
+
+ # num_joints = joint_cam.shape[0]
+ # joint_valid = np.ones((num_joints, 1))
+ if valid_kps3d:
+ joint_cam = keypoints3d[valid_idx]
+ else:
+ joint_cam = None
+
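+            # remove eye-pose entries from the SMPL-X parameters before further processing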
+ if 'leye_pose_0' in smplx.keys():
+ smplx.pop('leye_pose_0')
+ if 'leye_pose_1' in smplx.keys():
+ smplx.pop('leye_pose_1')
+ if 'leye_pose' in smplx.keys():
+ smplx.pop('leye_pose')
+ if 'reye_pose_0' in smplx.keys():
+ smplx.pop('reye_pose_0')
+ if 'reye_pose_1' in smplx.keys():
+ smplx.pop('reye_pose_1')
+ if 'reye_pose' in smplx.keys():
+ smplx.pop('reye_pose')
+
+
+ smplx_param = {k: v[valid_idx] for k, v in smplx.items()}
+ gender_ = gender[valid_idx] \
+ if gender is not None else np.array(['neutral']*(valid_num))
+
+ # TODO: set invalid if None?
+ smplx_param['root_pose'] = smplx_param.pop('global_orient', None)
+ smplx_param['shape'] = smplx_param.pop('betas', None)
+ smplx_param['trans'] = smplx_param.pop('transl', np.zeros(3))
+ smplx_param['lhand_pose'] = smplx_param.pop('left_hand_pose', None)
+ smplx_param['rhand_pose'] = smplx_param.pop(
+ 'right_hand_pose', None)
+ smplx_param['expr'] = smplx_param.pop('expression', None)
+
+ # TODO do not fix betas, give up shape supervision
+ if 'betas_neutral' in smplx_param and self.data_split == 'train':
+ smplx_param['shape'] = smplx_param.pop('betas_neutral')
+ # smplx_param['shape'] = np.zeros(10, dtype=np.float32)
+
+ # # TODO fix shape of poses
+ if self.__class__.__name__ == 'Talkshow':
+ smplx_param['body_pose'] = smplx_param['body_pose'].reshape(
+ -1, 21, 3)
+ smplx_param['lhand_pose'] = smplx_param['lhand_pose'].reshape(
+ -1, 15, 3)
+                smplx_param['rhand_pose'] = smplx_param['rhand_pose'].reshape(
+ -1, 15, 3)
+ smplx_param['expr'] = smplx_param['expr'][:, :10]
+
+ if self.__class__.__name__ == 'BEDLAM':
+ smplx_param['shape'] = smplx_param['shape'][:, :10]
+
+ if as_smplx == 'smpl':
+ smplx_param['shape'] = np.zeros(
+ [valid_num, 10],
+ dtype=np.float32) # drop smpl betas for smplx
+ smplx_param['body_pose'] = smplx_param[
+ 'body_pose'][:, :21, :] # use smpl body_pose on smplx
+ if as_smplx == 'smplh':
+ smplx_param['shape'] = np.zeros(
+ [valid_num, 10],
+ dtype=np.float32) # drop smpl betas for smplx
+
+            if smplx_param['lhand_pose'] is None or self.body_only:
+                smplx_param['lhand_valid'] = np.zeros(valid_num, dtype=np.bool_)
+ else:
+ smplx_param['lhand_valid'] = lhand_valid[valid_idx]
+
+            if smplx_param['rhand_pose'] is None or self.body_only:
+                smplx_param['rhand_valid'] = np.zeros(valid_num, dtype=np.bool_)
+ else:
+ smplx_param['rhand_valid'] = rhand_valid[valid_idx]
+
+            if smplx_param['expr'] is None or self.body_only:
+                smplx_param['face_valid'] = np.zeros(valid_num, dtype=np.bool_)
+ else:
+ smplx_param['face_valid'] = face_valid[valid_idx]
+
+ if joint_cam is not None and np.any(np.isnan(joint_cam)):
+ continue
+
+
+
+ datalist.append({
+ 'img_path': img_path,
+ 'img_shape': img_shape,
+ 'bbox': body_bbox_list,
+ 'lhand_bbox': lhand_bbox_list,
+ 'rhand_bbox': rhand_bbox_list,
+ 'face_bbox': face_bbox_list,
+ 'joint_img': joint_img,
+ 'joint_cam': joint_cam,
+ 'smplx_param': smplx_param,
+ 'as_smplx': as_smplx,
+ 'gender': gender_
+ })
+
+ # save memory
+ del content, image_path, bbox_xywh, lhand_bbox_xywh, rhand_bbox_xywh, face_bbox_xywh, keypoints3d, keypoints2d
+
+ if self.data_split == 'train':
+ print(f'[{self.__class__.__name__} train] original size:',
+ int(num_examples), '. Sample interval:',
+ train_sample_interval, '. Sampled size:', len(datalist))
+
+ if getattr(cfg, 'data_strategy',
+ None) == 'balance' and self.data_split == 'train':
+ print(
+ f'[{self.__class__.__name__}] Using [balance] strategy with datalist shuffled...'
+ )
+ random.shuffle(datalist)
+
+ return datalist
+
+    def __getitem__(self, idx):
+ try:
+ data = copy.deepcopy(self.datalist[idx])
+ except Exception as e:
+ print(f'[{self.__class__.__name__}] Error loading data {idx}')
+ print(e)
+ exit(0)
+
+ img_path, img_shape, bbox = data['img_path'], data['img_shape'], data[
+ 'bbox']
+ as_smplx = data['as_smplx']
+ if 'gender' in data:
+ gender = data['gender'].copy()
+ for gender_str, gender_num in {
+ 'neutral': -1, 'male': 0, 'female': 1}.items():
+ gender[gender==gender_str]=gender_num
+ gender = gender.astype(int)
+ else:
+ gender = np.array([-1]*len(bbox))
+ img_whole_bbox = np.array([0, 0, img_shape[1], img_shape[0]])
+ img = load_img(img_path, order='BGR')
+ num_person = len(data['bbox'])
+ data_name = self.__class__.__name__
+ img, img2bb_trans, bb2img_trans, rot, do_flip = \
+ augmentation_instance_sample(img, img_whole_bbox, self.data_split,data,data_name)
+ cropped_img_shape=img.shape[:2]
+ num_person = len(data['bbox'])
+
+ if self.data_split == 'train':
+ # h36m gt
+ if 'joint_cam' in data:
+ joint_cam = data['joint_cam']
+ else:
+ joint_cam = None
+
+ if joint_cam is not None:
+ dummy_cord = False
+                joint_cam[:, :, :3] = joint_cam[:, :, :3] - \
+                    joint_cam[:, self.joint_set['root_joint_idx'], None, :3]  # root-relative
+ else:
+ # dummy cord as joint_cam
+ dummy_cord = True
+ joint_cam = np.zeros(
+ (num_person, self.joint_set['joint_num'], 4),
+ dtype=np.float32)
+
+ joint_img = data['joint_img']
+
+ # do rotation on keypoints
+ joint_img_aug, joint_cam_wo_ra, joint_cam_ra, joint_trunc = \
+ process_db_coord_batch_no_valid(
+ joint_img, joint_cam, do_flip, img_shape,
+ self.joint_set['flip_pairs'], img2bb_trans, rot,
+ self.joint_set['joints_name'], smpl_x.joints_name,
+ cropped_img_shape)
+ joint_img_aug[:,:,2:] = joint_img_aug[:,:,2:] * joint_trunc
+
+ # smplx coordinates and parameters
+ smplx_param = data['smplx_param']
+ if self.__class__.__name__ in ['CHI3D', 'SynBody']:
+                smplx_param['lhand_pose'] -= self.lhand_mean[None]
+                smplx_param['rhand_pose'] -= self.rhand_mean[None]
+ part_valid = {
+ 'lhand': smplx_param['lhand_valid'],
+ 'rhand': smplx_param['rhand_valid'],
+ 'face': smplx_param['face_valid']
+ }
+ smplx_pose, smplx_shape, smplx_expr, smplx_pose_valid, \
+ smplx_joint_valid, smplx_expr_valid, smplx_shape_valid = \
+ process_human_model_output_batch_ubody(
+ smplx_param, do_flip, rot, as_smplx, part_valid)
+
+ # if cam not provided, we take joint_img as smplx joint 2d,
+ # which is commonly the case for our processed humandata
+ # TODO temp fix keypoints3d for renbody
+
+
+ # change smplx_shape if use_betas_neutral
+ # processing follows that in process_human_model_output
+ if self.use_betas_neutral:
+ smplx_shape = smplx_param['betas_neutral'].reshape(
+ num_person, -1)
+ smplx_shape[(np.abs(smplx_shape) > 3).any(axis=1)] = 0.
+ smplx_shape = smplx_shape.reshape(num_person, -1)
+
+ # smplx_pose_valid = np.tile(smplx_pose_valid[:,:, None], (1, 3)).reshape(num_person,-1)
+
+ # smplx_pose = smplx_pose * smplx_pose_valid
+ # smplx_expr = smplx_expr * smplx_expr_valid[:, None]
+ smplx_joint_valid = smplx_joint_valid[:, :, None]
+
+ lhand_bbox_center_list = []
+ lhand_bbox_valid_list = []
+ lhand_bbox_size_list = []
+ lhand_bbox_list = []
+ face_bbox_center_list = []
+ face_bbox_size_list = []
+ face_bbox_valid_list = []
+ face_bbox_list = []
+ rhand_bbox_center_list = []
+ rhand_bbox_valid_list = []
+ rhand_bbox_size_list = []
+ rhand_bbox_list = []
+ body_bbox_center_list = []
+ body_bbox_size_list = []
+ body_bbox_valid_list = []
+ body_bbox_list = []
+ # hand and face bbox transform
+
+ for i in range(num_person):
+                # TODO: confirm whether an invalid body bbox raises an assertion error here
+ body_bbox, body_bbox_valid = self.process_hand_face_bbox(
+ data['bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+
+ lhand_bbox, lhand_bbox_valid = self.process_hand_face_bbox(
+ data['lhand_bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+ lhand_bbox_valid *= smplx_param['lhand_valid'][i]
+
+ rhand_bbox, rhand_bbox_valid = self.process_hand_face_bbox(
+ data['rhand_bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+ rhand_bbox_valid *= smplx_param['rhand_valid'][i]
+
+ face_bbox, face_bbox_valid = self.process_hand_face_bbox(
+ data['face_bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+ face_bbox_valid *= smplx_param['face_valid'][i]
+
+ if do_flip:
+ lhand_bbox, rhand_bbox = rhand_bbox, lhand_bbox
+ lhand_bbox_valid, rhand_bbox_valid = rhand_bbox_valid, lhand_bbox_valid
+
+ body_bbox_list.append(body_bbox)
+ lhand_bbox_list.append(lhand_bbox)
+ rhand_bbox_list.append(rhand_bbox)
+ face_bbox_list.append(face_bbox)
+
+ lhand_bbox_center = (lhand_bbox[0] + lhand_bbox[1]) / 2.
+ rhand_bbox_center = (rhand_bbox[0] + rhand_bbox[1]) / 2.
+ face_bbox_center = (face_bbox[0] + face_bbox[1]) / 2.
+ body_bbox_center = (body_bbox[0] + body_bbox[1]) / 2.
+ lhand_bbox_size = lhand_bbox[1] - lhand_bbox[0]
+ rhand_bbox_size = rhand_bbox[1] - rhand_bbox[0]
+
+ face_bbox_size = face_bbox[1] - face_bbox[0]
+ body_bbox_size = body_bbox[1] - body_bbox[0]
+ lhand_bbox_center_list.append(lhand_bbox_center)
+ lhand_bbox_valid_list.append(lhand_bbox_valid)
+ lhand_bbox_size_list.append(lhand_bbox_size)
+ face_bbox_center_list.append(face_bbox_center)
+ face_bbox_size_list.append(face_bbox_size)
+ face_bbox_valid_list.append(face_bbox_valid)
+ rhand_bbox_center_list.append(rhand_bbox_center)
+ rhand_bbox_valid_list.append(rhand_bbox_valid)
+ rhand_bbox_size_list.append(rhand_bbox_size)
+ body_bbox_center_list.append(body_bbox_center)
+ body_bbox_size_list.append(body_bbox_size)
+ body_bbox_valid_list.append(body_bbox_valid)
+
+
+ body_bbox = np.stack(body_bbox_list, axis=0)
+ lhand_bbox = np.stack(lhand_bbox_list, axis=0)
+ rhand_bbox = np.stack(rhand_bbox_list, axis=0)
+ face_bbox = np.stack(face_bbox_list, axis=0)
+ lhand_bbox_center = np.stack(lhand_bbox_center_list, axis=0)
+ lhand_bbox_valid = np.stack(lhand_bbox_valid_list, axis=0)
+ lhand_bbox_size = np.stack(lhand_bbox_size_list, axis=0)
+ face_bbox_center = np.stack(face_bbox_center_list, axis=0)
+ face_bbox_size = np.stack(face_bbox_size_list, axis=0)
+ face_bbox_valid = np.stack(face_bbox_valid_list, axis=0)
+ body_bbox_center = np.stack(body_bbox_center_list, axis=0)
+ body_bbox_size = np.stack(body_bbox_size_list, axis=0)
+ body_bbox_valid = np.stack(body_bbox_valid_list, axis=0)
+ rhand_bbox_center = np.stack(rhand_bbox_center_list, axis=0)
+ rhand_bbox_valid = np.stack(rhand_bbox_valid_list, axis=0)
+ rhand_bbox_size = np.stack(rhand_bbox_size_list, axis=0)
+
+ inputs = {'img': img}
+
+ is_3D = True
+ # joint_img_aug[:,:,2] = joint_img_aug[:,:,2] * body_bbox_valid[:,None]
+
+ # assign 2d kps valid to 3d kps
+ joint_cam_wo_ra[..., -1] = joint_img_aug[..., -1] * smplx_joint_valid[..., 0]
+ joint_cam_ra[..., -1] = joint_img_aug[..., -1] * smplx_joint_valid[..., 0]
+ joint_img_aug[...,-1] = joint_img_aug[...,-1] * smplx_joint_valid[...,0]
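+            # only persons whose body bbox survives the crop/flip
+            # (body_bbox_valid > 0) are kept in the targets and meta_info below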
+ targets = {
+ # keypoints2d, [0,img_w],[0,img_h] -> [0,1] -> [0,output_hm_shape]
+ 'joint_img': joint_img_aug[body_bbox_valid>0],
+ # joint_cam, kp3d wo ra # raw kps3d probably without ra
+ 'joint_cam': joint_cam_wo_ra[body_bbox_valid>0],
+ # kps3d with body, face, hand ra
+ 'smplx_joint_cam': joint_cam_ra[body_bbox_valid>0],
+ 'smplx_pose': smplx_pose[body_bbox_valid>0],
+ 'smplx_shape': smplx_shape[body_bbox_valid>0],
+ 'smplx_expr': smplx_expr[body_bbox_valid>0],
+ 'lhand_bbox_center': lhand_bbox_center[body_bbox_valid>0],
+ 'lhand_bbox_size': lhand_bbox_size[body_bbox_valid>0],
+ 'rhand_bbox_center': rhand_bbox_center[body_bbox_valid>0],
+ 'rhand_bbox_size': rhand_bbox_size[body_bbox_valid>0],
+ 'face_bbox_center': face_bbox_center[body_bbox_valid>0],
+ 'face_bbox_size': face_bbox_size[body_bbox_valid>0],
+ 'body_bbox_center': body_bbox_center[body_bbox_valid>0],
+ 'body_bbox_size': body_bbox_size[body_bbox_valid>0],
+ 'body_bbox': body_bbox.reshape(-1,4)[body_bbox_valid>0],
+ 'lhand_bbox': lhand_bbox.reshape(-1,4)[body_bbox_valid>0],
+ 'rhand_bbox': rhand_bbox.reshape(-1,4)[body_bbox_valid>0],
+ 'face_bbox': face_bbox.reshape(-1,4)[body_bbox_valid>0],
+ 'gender': gender[body_bbox_valid>0]}
+
+ meta_info = {
+ 'joint_trunc': joint_trunc[body_bbox_valid>0],
+ 'smplx_pose_valid': smplx_pose_valid[body_bbox_valid>0],
+ 'smplx_shape_valid': smplx_shape_valid[body_bbox_valid>0],
+ 'smplx_expr_valid': smplx_expr_valid[body_bbox_valid>0],
+ 'is_3D': is_3D,
+ 'lhand_bbox_valid': lhand_bbox_valid[body_bbox_valid>0],
+ 'rhand_bbox_valid': rhand_bbox_valid[body_bbox_valid>0],
+ 'face_bbox_valid': face_bbox_valid[body_bbox_valid>0],
+ 'body_bbox_valid': body_bbox_valid[body_bbox_valid>0],
+ 'img_shape': np.array(img.shape[:2]),
+ 'ori_shape':data['img_shape'],
+ 'idx': idx,
+ }
+
+
+ result = {**inputs, **targets, **meta_info}
+
+ result = self.normalize(result)
+ result = self.format(result)
+ return result
+
+
+
+ if self.data_split == 'test':
+ self.cam_param = {}
+ if 'joint_cam' not in data:
+ joint_cam = None
+ else:
+ joint_cam = data['joint_cam']
+
+ if joint_cam is not None:
+ dummy_cord = False
+ joint_cam[:,:,:3] = joint_cam[:,:,:3] - joint_cam[
+ :, self.joint_set['root_joint_idx'], None, :3] # root-relative
+ else:
+ # dummy cord as joint_cam
+ dummy_cord = True
+ joint_cam = np.zeros((num_person, 137, 4), dtype=np.float32)
+
+ joint_img = data['joint_img']
+
+ joint_img_aug, joint_cam_wo_ra, joint_cam_ra, joint_trunc = \
+ process_db_coord_batch_no_valid(
+ joint_img, joint_cam, do_flip, img_shape,
+ self.joint_set['flip_pairs'], img2bb_trans, rot,
+ self.joint_set['joints_name'], smpl_x.joints_name,
+ cropped_img_shape)
+ joint_img_aug[:,:,2:] = joint_img_aug[:,:,2:] * joint_trunc
+
+
+ # smplx coordinates and parameters
+ smplx_param = data['smplx_param']
+ # smplx_cam_trans = np.array(
+ # smplx_param['trans']) if 'trans' in smplx_param else None
+            # TODO: remove this, separate smpl and smplx
+ part_valid = {
+ 'lhand': smplx_param['lhand_valid'],
+ 'rhand': smplx_param['rhand_valid'],
+ 'face': smplx_param['face_valid']
+ }
+ smplx_pose, smplx_shape, smplx_expr, smplx_pose_valid, \
+ smplx_joint_valid, smplx_expr_valid, smplx_shape_valid = \
+ process_human_model_output_batch_ubody(
+ smplx_param, do_flip, rot, as_smplx, part_valid)
+ # if cam not provided, we take joint_img as smplx joint 2d,
+ # which is commonly the case for our processed humandata
+ if self.use_betas_neutral:
+ smplx_shape = smplx_param['betas_neutral'].reshape(
+ num_person, -1)
+ smplx_shape[(np.abs(smplx_shape) > 3).any(axis=1)] = 0.
+ smplx_shape = smplx_shape.reshape(num_person, -1)
+
+ # smplx_pose_valid = np.tile(smplx_pose_valid[:,:, None], (1, 3)).reshape(num_person,-1)
+ smplx_joint_valid = smplx_joint_valid[:, :, None]
+ # smplx_pose = smplx_pose*smplx_pose_valid
+ # smplx_expr = smplx_expr*smplx_expr_valid
+
+ # if not (smplx_shape == 0).all():
+ # smplx_shape_valid = True
+ # else:
+ # smplx_shape_valid = False
+ lhand_bbox_center_list = []
+ lhand_bbox_valid_list = []
+ lhand_bbox_size_list = []
+ lhand_bbox_list = []
+ face_bbox_center_list = []
+ face_bbox_size_list = []
+ face_bbox_valid_list = []
+ face_bbox_list = []
+ rhand_bbox_center_list = []
+ rhand_bbox_valid_list = []
+ rhand_bbox_size_list = []
+ rhand_bbox_list = []
+ body_bbox_center_list = []
+ body_bbox_size_list = []
+ body_bbox_valid_list = []
+ body_bbox_list = []
+
+ for i in range(num_person):
+                lhand_bbox, lhand_bbox_valid = self.process_hand_face_bbox(
+                    data['lhand_bbox'][i], do_flip, img_shape, img2bb_trans,
+                    cropped_img_shape)
+                lhand_bbox_valid *= smplx_param['lhand_valid'][i]
+
+                rhand_bbox, rhand_bbox_valid = self.process_hand_face_bbox(
+                    data['rhand_bbox'][i], do_flip, img_shape, img2bb_trans,
+                    cropped_img_shape)
+                rhand_bbox_valid *= smplx_param['rhand_valid'][i]
+
+                face_bbox, face_bbox_valid = self.process_hand_face_bbox(
+                    data['face_bbox'][i], do_flip, img_shape, img2bb_trans,
+                    cropped_img_shape)
+                face_bbox_valid *= smplx_param['face_valid'][i]
+
+                body_bbox, body_bbox_valid = self.process_hand_face_bbox(
+                    data['bbox'][i], do_flip, img_shape, img2bb_trans,
+                    cropped_img_shape)
+
+ if do_flip:
+ lhand_bbox, rhand_bbox = rhand_bbox, lhand_bbox
+ lhand_bbox_valid, rhand_bbox_valid = rhand_bbox_valid, lhand_bbox_valid
+
+ body_bbox_list.append(body_bbox)
+ lhand_bbox_list.append(lhand_bbox)
+ rhand_bbox_list.append(rhand_bbox)
+ face_bbox_list.append(face_bbox)
+
+ lhand_bbox_center = (lhand_bbox[0] + lhand_bbox[1]) / 2.
+ rhand_bbox_center = (rhand_bbox[0] + rhand_bbox[1]) / 2.
+ face_bbox_center = (face_bbox[0] + face_bbox[1]) / 2.
+ body_bbox_center = (body_bbox[0] + body_bbox[1]) / 2.
+ lhand_bbox_size = lhand_bbox[1] - lhand_bbox[0]
+ rhand_bbox_size = rhand_bbox[1] - rhand_bbox[0]
+
+ face_bbox_size = face_bbox[1] - face_bbox[0]
+ body_bbox_size = body_bbox[1] - body_bbox[0]
+ lhand_bbox_center_list.append(lhand_bbox_center)
+ lhand_bbox_valid_list.append(lhand_bbox_valid)
+ lhand_bbox_size_list.append(lhand_bbox_size)
+ face_bbox_center_list.append(face_bbox_center)
+ face_bbox_size_list.append(face_bbox_size)
+ face_bbox_valid_list.append(face_bbox_valid)
+ rhand_bbox_center_list.append(rhand_bbox_center)
+ rhand_bbox_valid_list.append(rhand_bbox_valid)
+ rhand_bbox_size_list.append(rhand_bbox_size)
+ body_bbox_center_list.append(body_bbox_center)
+ body_bbox_size_list.append(body_bbox_size)
+ body_bbox_valid_list.append(body_bbox_valid)
+
+ body_bbox = np.stack(body_bbox_list, axis=0)
+ lhand_bbox = np.stack(lhand_bbox_list, axis=0)
+ rhand_bbox = np.stack(rhand_bbox_list, axis=0)
+ face_bbox = np.stack(face_bbox_list, axis=0)
+ lhand_bbox_center = np.stack(lhand_bbox_center_list, axis=0)
+ lhand_bbox_valid = np.stack(lhand_bbox_valid_list, axis=0)
+ lhand_bbox_size = np.stack(lhand_bbox_size_list, axis=0)
+ face_bbox_center = np.stack(face_bbox_center_list, axis=0)
+ face_bbox_size = np.stack(face_bbox_size_list, axis=0)
+ face_bbox_valid = np.stack(face_bbox_valid_list, axis=0)
+ body_bbox_center = np.stack(body_bbox_center_list, axis=0)
+ body_bbox_size = np.stack(body_bbox_size_list, axis=0)
+ body_bbox_valid = np.stack(body_bbox_valid_list, axis=0)
+ rhand_bbox_center = np.stack(rhand_bbox_center_list, axis=0)
+ rhand_bbox_valid = np.stack(rhand_bbox_valid_list, axis=0)
+ rhand_bbox_size = np.stack(rhand_bbox_size_list, axis=0)
+
+ inputs = {'img': img}
+ joint_img_aug[:,:,2] = joint_img_aug[:,:,2] * body_bbox_valid[:,None]
+
+ # assign 2d kps valid to 3d kps
+ joint_cam_wo_ra[..., -1] = joint_img_aug[..., -1] * smplx_joint_valid[..., 0]
+ joint_cam_ra[..., -1] = joint_img_aug[..., -1] * smplx_joint_valid[..., 0]
+ joint_img_aug[...,-1] = joint_img_aug[...,-1] * smplx_joint_valid[...,0]
+ targets = {
+ # keypoints2d, [0,img_w],[0,img_h] -> [0,1] -> [0,output_hm_shape]
+ 'joint_img': joint_img_aug,
+ # projected smplx if valid cam_param, else same as keypoints2d
+ # joint_cam, kp3d wo ra # raw kps3d probably without ra
+ 'joint_cam': joint_cam_wo_ra,
+ 'ann_idx': idx,
+ # kps3d with body, face, hand ra
+ 'smplx_joint_cam': joint_cam_ra,
+ 'smplx_pose': smplx_pose,
+ 'smplx_shape': smplx_shape,
+ 'smplx_expr': smplx_expr,
+ 'lhand_bbox_center': lhand_bbox_center,
+ 'lhand_bbox_size': lhand_bbox_size,
+ 'rhand_bbox_center': rhand_bbox_center,
+ 'rhand_bbox_size': rhand_bbox_size,
+ 'face_bbox_center': face_bbox_center,
+ 'face_bbox_size': face_bbox_size,
+ 'body_bbox_center': body_bbox_center,
+ 'body_bbox_size': body_bbox_size,
+ 'body_bbox': body_bbox.reshape(-1,4),
+ 'lhand_bbox': lhand_bbox.reshape(-1,4),
+ 'rhand_bbox': rhand_bbox.reshape(-1,4),
+ 'face_bbox': face_bbox.reshape(-1,4),
+ 'gender': gender,
+ 'bb2img_trans': bb2img_trans,
+ }
+
+ if self.body_only:
+ meta_info = {
+ 'joint_trunc': joint_trunc,
+ 'smplx_pose_valid': smplx_pose_valid,
+ 'smplx_shape_valid': float(smplx_shape_valid),
+ 'smplx_expr_valid': smplx_expr_valid,
+ 'is_3D': float(False) if dummy_cord else float(True),
+ 'lhand_bbox_valid': lhand_bbox_valid,
+ 'rhand_bbox_valid': rhand_bbox_valid,
+ 'face_bbox_valid': face_bbox_valid,
+ 'body_bbox_valid': body_bbox_valid,
+ 'img_shape': np.array(img.shape[:2]),
+ 'ori_shape':data['img_shape']
+ }
+ else:
+ meta_info = {
+ 'joint_trunc': joint_trunc,
+ 'smplx_pose_valid': smplx_pose_valid,
+ 'smplx_shape_valid': smplx_shape_valid,
+ 'smplx_expr_valid': smplx_expr_valid,
+ 'is_3D': float(False) if dummy_cord else float(True),
+ 'lhand_bbox_valid': lhand_bbox_valid,
+ 'rhand_bbox_valid': rhand_bbox_valid,
+ 'face_bbox_valid': face_bbox_valid,
+ 'body_bbox_valid': body_bbox_valid,
+ 'img_shape': np.array(img.shape[:2]),
+ 'ori_shape':data['img_shape']}
+
+ result = {**inputs, **targets, **meta_info}
+ result = self.normalize(result)
+ result = self.format(result)
+            return result
+
+    def print_eval_result(self, eval_result):
+
+ print('UBody test results are dumped at: ' +
+ osp.join(cfg.result_dir, 'predictions'))
+
+ if self.data_split == 'test' and self.test_set == 'test': # do not print. just submit the results to the official evaluation server
+ return
+
+ print('======UBody-val======')
+ print('PA MPVPE (All): %.2f mm' % np.mean(eval_result['pa_mpvpe_all']))
+ print('PA MPVPE (L-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_l_hand']))
+ print('PA MPVPE (R-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_r_hand']))
+ print('PA MPVPE (Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_hand']))
+ print('PA MPVPE (Face): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_face']))
+ print()
+
+ print('MPVPE (All): %.2f mm' % np.mean(eval_result['mpvpe_all']))
+ print('MPVPE (L-Hands): %.2f mm' %
+ np.mean(eval_result['mpvpe_l_hand']))
+ print('MPVPE (R-Hands): %.2f mm' %
+ np.mean(eval_result['mpvpe_r_hand']))
+ print('MPVPE (Hands): %.2f mm' % np.mean(eval_result['mpvpe_hand']))
+ print('MPVPE (Face): %.2f mm' % np.mean(eval_result['mpvpe_face']))
+
+        with open(os.path.join(cfg.result_dir, 'result.txt'), 'w') as f:
+            f.write('UBody-val dataset:\n')
+            f.write('PA MPVPE (All): %.2f mm\n' %
+                    np.mean(eval_result['pa_mpvpe_all']))
+            f.write('PA MPVPE (L-Hands): %.2f mm\n' %
+                    np.mean(eval_result['pa_mpvpe_l_hand']))
+            f.write('PA MPVPE (R-Hands): %.2f mm\n' %
+                    np.mean(eval_result['pa_mpvpe_r_hand']))
+            f.write('PA MPVPE (Hands): %.2f mm\n' %
+                    np.mean(eval_result['pa_mpvpe_hand']))
+            f.write('PA MPVPE (Face): %.2f mm\n' %
+                    np.mean(eval_result['pa_mpvpe_face']))
+            f.write('MPVPE (All): %.2f mm\n' %
+                    np.mean(eval_result['mpvpe_all']))
+            f.write('MPVPE (L-Hands): %.2f mm\n' %
+                    np.mean(eval_result['mpvpe_l_hand']))
+            f.write('MPVPE (R-Hands): %.2f mm\n' %
+                    np.mean(eval_result['mpvpe_r_hand']))
+            f.write('MPVPE (Hands): %.2f mm\n' %
+                    np.mean(eval_result['mpvpe_hand']))
+            f.write('MPVPE (Face): %.2f mm\n' %
+                    np.mean(eval_result['mpvpe_face']))
diff --git a/datasets/__init__.py b/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/datasets/dataset.py b/datasets/dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..01c9a3a3fdebcfeb047c6b67483d6d47d4e52b94
--- /dev/null
+++ b/datasets/dataset.py
@@ -0,0 +1,118 @@
+import random
+import numpy as np
+from torch.utils.data.dataset import Dataset
+from config.config import cfg
+
+class MultipleDatasets(Dataset):
+ def __init__(self,
+ dbs,
+ partition,
+ make_same_len=True,
+ total_len=None,
+ verbose=False):
+ self.dbs = dbs
+ self.db_num = len(self.dbs)
+ self.max_db_data_num = max([len(db) for db in dbs])
+ self.db_len_cumsum = np.cumsum([len(db) for db in dbs])
+ self.make_same_len = make_same_len
+ # self.partition = partition
+ self.partition = {k: v for k, v in sorted(partition.items(), key=lambda item: item[1])}
+ self.dataset = {}
+ for db in dbs:
+ self.dataset.update({db.__class__.__name__: db})
+
+ if verbose:
+ print('datasets:', [len(self.dbs[i]) for i in range(self.db_num)])
+ print(
+ f'Sample Ratio: {self.partition}')
+
+ def __len__(self):
+ return self.max_db_data_num
+
+ def __getitem__(self, index):
+ p = np.random.rand()
+ v = list(self.partition.values())
+ k = list(self.partition.keys())
+ for i,v_i in enumerate(v):
+ if p<=v_i:
+ return self.dataset[k[i]][index % len(self.dataset[k[i]])]
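+
+    # Note on the expected `partition` format (an assumption inferred from the
+    # sampling loop above, not documented elsewhere): values are cumulative
+    # probability thresholds in ascending order, ending at 1.0, keyed by the
+    # dataset class names. For example, with hypothetical dataset names:
+    #     partition = {'DatasetA': 0.4, 'DatasetB': 1.0}
+    # draws ~40% of samples from DatasetA and ~60% from DatasetB; if the
+    # largest value were below 1.0, some draws would match no threshold and
+    # __getitem__ would fall through and return None.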
+
+
+class MultipleDatasets_debug(Dataset):
+ def __init__(self, dbs, make_same_len=True, total_len=None, verbose=False):
+ self.dbs = dbs
+ self.db_num = len(self.dbs)
+ self.max_db_data_num = max([len(db) for db in dbs])
+ self.db_len_cumsum = np.cumsum([len(db) for db in dbs])
+ self.make_same_len = make_same_len
+
+ if total_len == 'auto':
+ self.total_len = self.db_len_cumsum[-1]
+ self.auto_total_len = True
+ else:
+ self.total_len = total_len
+ self.auto_total_len = False
+
+ if total_len is not None:
+ self.per_db_len = self.total_len // self.db_num
+ if verbose:
+ print('datasets:', [len(self.dbs[i]) for i in range(self.db_num)])
+ print(
+ f'Auto total length: {self.auto_total_len}, {self.total_len}')
+
+ def __len__(self):
+ # all dbs have the same length
+ if self.make_same_len:
+ if self.total_len is None:
+ # match the longest length
+ return self.max_db_data_num * self.db_num
+ else:
+ # each dataset has the same length and total len is fixed
+ return self.total_len
+ else:
+ # each db has different length, simply concat
+ return sum([len(db) for db in self.dbs])
+
+ def __getitem__(self, index):
+ if self.make_same_len:
+ if self.total_len is None:
+ # match the longest length
+ db_idx = index // self.max_db_data_num
+ data_idx = index % self.max_db_data_num
+ if data_idx >= len(self.dbs[db_idx]) * (
+ self.max_db_data_num //
+ len(self.dbs[db_idx])): # last batch: random sampling
+ data_idx = random.randint(0, len(self.dbs[db_idx]) - 1)
+ else: # before last batch: use modular
+ data_idx = data_idx % len(self.dbs[db_idx])
+ else:
+ db_idx = index // self.per_db_len
+ data_idx = index % self.per_db_len
+ if db_idx > (self.db_num - 1):
+ # last batch: randomly choose one dataset
+ db_idx = random.randint(0, self.db_num - 1)
+
+ if len(self.dbs[db_idx]) < self.per_db_len and \
+ data_idx >= len(self.dbs[db_idx]) * (self.per_db_len // len(self.dbs[db_idx])):
+ # last batch: random sampling in this dataset
+ data_idx = random.randint(0, len(self.dbs[db_idx]) - 1)
+ else:
+ # before last batch: use modular
+ data_idx = data_idx % len(self.dbs[db_idx])
+
+ else:
+ for i in range(self.db_num):
+ if index < self.db_len_cumsum[i]:
+ db_idx = i
+ break
+ if db_idx == 0:
+ data_idx = index
+ else:
+ data_idx = index - self.db_len_cumsum[db_idx - 1]
+
+ return self.dbs[db_idx][data_idx]
diff --git a/datasets/humandata.py b/datasets/humandata.py
new file mode 100644
index 0000000000000000000000000000000000000000..82115c4518736a273d05d8b644708154b4f71c9f
--- /dev/null
+++ b/datasets/humandata.py
@@ -0,0 +1,1301 @@
+import os
+import os.path as osp
+import numpy as np
+import torch
+import cv2
+import json
+import copy
+from pycocotools.coco import COCO
+from config.config import cfg
+from util.human_models import smpl_x
+from util.preprocessing import (
+ load_img, process_bbox, augmentation_instance_sample, process_human_model_output_batch_simplify,process_db_coord_batch_no_valid)
+from util.transforms import world2cam, cam2pixel, rigid_align
+from detrsmpl.utils.geometry import batch_rodrigues, project_points_new, weak_perspective_projection, perspective_projection
+import tqdm
+import time
+import random
+from detrsmpl.utils.demo_utils import box2cs, xywh2xyxy, xyxy2xywh
+import torch.distributed as dist
+
+KPS2D_KEYS = [
+ 'keypoints2d_ori', 'keypoints2d_smplx', 'keypoints2d_smpl',
+ 'keypoints2d_original','keypoints2d_gta','keypoints2d'
+]
+KPS3D_KEYS = [
+ 'keypoints3d_cam', 'keypoints3d', 'keypoints3d_smplx', 'keypoints3d_smpl',
+ 'keypoints3d_original', 'keypoints3d_gta','keypoints3d'
+]
+# keypoints3d_cam (root-aligned) has the highest priority, followed by the legacy key keypoints3d
+# when keypoints3d_smplx is present, prefer it over keypoints3d_original
+
+from util.formatting import DefaultFormatBundle
+from detrsmpl.data.datasets.pipelines.transforms import Normalize
+
+class Cache():
+ """A custom implementation for OSX pipeline."""
+ def __init__(self, load_path=None):
+ if load_path is not None:
+ self.load(load_path)
+
+ def load(self, load_path):
+ self.load_path = load_path
+ self.cache = np.load(load_path, allow_pickle=True)
+ self.data_len = self.cache['data_len']
+ self.data_strategy = self.cache['data_strategy']
+ assert self.data_len == len(self.cache) - 2 # data_len, data_strategy
+ self.cache = None
+
+ @classmethod
+ def save(cls, save_path, data_list, data_strategy):
+ assert save_path is not None, 'save_path is None'
+ data_len = len(data_list)
+ cache = {}
+ for i, data in enumerate(data_list):
+ cache[str(i)] = data
+ assert len(cache) == data_len
+ # update meta
+ cache.update({'data_len': data_len, 'data_strategy': data_strategy})
+ # import pdb; pdb.set_trace()
+ np.savez_compressed(save_path, **cache)
+ print(f'Cache saved to {save_path}.')
+
+ # def shuffle(self):
+ # random.shuffle(self.mapping)
+
+ def __len__(self):
+ return self.data_len
+
+ def __getitem__(self, idx):
+ if self.cache is None:
+ self.cache = np.load(self.load_path, allow_pickle=True)
+ # mapped_idx = self.mapping[idx]
+ # cache_data = self.cache[str(mapped_idx)]
+ # print(self.cache.files)
+ cache_data = self.cache[str(idx)]
+ data = cache_data.item()
+ return data
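+
+    # Minimal usage sketch (illustrative only; the file name is hypothetical):
+    #     Cache.save('ubody_train_cache.npz', datalist, data_strategy='balance')
+    #     cache = Cache('ubody_train_cache.npz')
+    #     sample = cache[0]   # the .npz is re-opened lazily on first access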
+
+
+class HumanDataset(torch.utils.data.Dataset):
+
+ # same mapping for 144->137 and 190->137
+ SMPLX_137_MAPPING = [
+ 0, 1, 2, 4, 5, 7, 8, 12, 16, 17, 18, 19, 20, 21, 60, 61, 62, 63, 64,
+ 65, 59, 58, 57, 56, 55, 37, 38, 39, 66, 25, 26, 27, 67, 28, 29, 30, 68,
+ 34, 35, 36, 69, 31, 32, 33, 70, 52, 53, 54, 71, 40, 41, 42, 72, 43, 44,
+ 45, 73, 49, 50, 51, 74, 46, 47, 48, 75, 22, 15, 56, 57, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
+ 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
+ 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
+ 140, 141, 142, 143
+ ]
+
+ def __init__(self, transform, data_split):
+ self.transform = transform
+ self.data_split = data_split
+
+ # dataset information, to be filled by child class
+ self.img_dir = None
+ self.annot_path = None
+ self.annot_path_cache = None
+ self.use_cache = False
+ self.img_shape = None # (h, w)
+ self.cam_param = None # {'focal_length': (fx, fy), 'princpt': (cx, cy)}
+ self.use_betas_neutral = False
+ self.body_only = False
+ self.joint_set = {
+ 'joint_num': smpl_x.joint_num,
+ 'joints_name': smpl_x.joints_name,
+ 'flip_pairs': smpl_x.flip_pairs
+ }
+ self.joint_set['root_joint_idx'] = self.joint_set['joints_name'].index(
+ 'Pelvis')
+ self.format = DefaultFormatBundle()
+ self.normalize = Normalize(mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
+ self.keypoints2d = None
+ # self.rank = dist.get_rank()
+ self.lhand_mean = smpl_x.layer['neutral'].left_hand_mean.reshape(15, 3).cpu().numpy()
+ self.rhand_mean = smpl_x.layer['neutral'].right_hand_mean.reshape(15, 3).cpu().numpy()
+ # self.log_file_path = f'indices_node{rank}.txt'
+ def load_cache(self, annot_path_cache):
+ datalist = Cache(annot_path_cache)
+ # assert datalist.data_strategy == getattr(cfg, 'data_strategy', None), \
+ # f'Cache data strategy {datalist.data_strategy} does not match current data strategy ' \
+ # f'{getattr(cfg, "data_strategy", None)}'
+ return datalist
+
+ def save_cache(self, annot_path_cache, datalist):
+ print(
+ f'[{self.__class__.__name__}] Caching datalist to {self.annot_path_cache}...'
+ )
+ Cache.save(annot_path_cache,
+ datalist,
+ data_strategy=getattr(cfg, 'data_strategy', None))
+
+ def load_data(self, train_sample_interval=1,
+ hand_bbox_ratio=1, body_bbox_ratio=1):
+
+ content = np.load(self.annot_path, allow_pickle=True)
+ try:
+ frame_range = content['frame_range']
+ except KeyError:
+ self.num_data = len(content['image_path'])
+ frame_range = \
+ np.array([[i, i + 1] for i in range(self.num_data)])
+
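+        # frame_range appears to map each image to a [start, end) slice into the
+        # flat per-instance arrays (bbox_xywh, keypoints, smplx, ...), grouping
+        # all person instances of that image; when the npz has no 'frame_range'
+        # key, every image is treated as owning exactly one instance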
+ num_examples = len(frame_range)
+ if 'meta' in content:
+ meta = content['meta'].item()
+ print('meta keys:', meta.keys())
+ else:
+ meta = None
+ print(
+ 'No meta info provided! Please give height and width manually')
+
+ print(
+ f'Start loading humandata {self.annot_path} into memory...\nDataset includes: {content.files}'
+ )
+ tic = time.time()
+ image_path = content['image_path']
+ if meta is not None and 'height' in meta and len(meta['height'])>0:
+ height = np.array(meta['height'])
+ width = np.array(meta['width'])
+ image_shape = np.stack([height, width], axis=-1)
+ else:
+ image_shape = None
+
+ if meta is not None and 'gender' in meta and len(meta['gender']) != 0:
+ gender = np.array(meta['gender'])
+ else:
+ gender = None
+ bbox_xywh = content['bbox_xywh']
+
+ if 'smplx' in content:
+ smplx = content['smplx'].item()
+ as_smplx = 'smplx'
+ elif 'smpl' in content:
+ smplx = content['smpl'].item()
+ as_smplx = 'smpl'
+ elif 'smplh' in content:
+ smplx = content['smplh'].item()
+ as_smplx = 'smplh'
+ # TODO: temp solution, should be more general. But SHAPY is very special
+ elif self.__class__.__name__ == 'SHAPY':
+ smplx = {}
+ else:
+            raise KeyError('No SMPL/SMPL-X annotations available, please check keys:\n'
+                           f'{content.files}')
+
+ print('Smplx param', smplx.keys())
+
+ if 'lhand_bbox_xywh' in content and 'rhand_bbox_xywh' in content:
+ lhand_bbox_xywh = content['lhand_bbox_xywh']
+ rhand_bbox_xywh = content['rhand_bbox_xywh']
+ else:
+ lhand_bbox_xywh = np.zeros_like(bbox_xywh)
+ rhand_bbox_xywh = np.zeros_like(bbox_xywh)
+
+ if 'face_bbox_xywh' in content:
+ face_bbox_xywh = content['face_bbox_xywh']
+ else:
+ face_bbox_xywh = np.zeros_like(bbox_xywh)
+
+ if meta is not None and 'smplx_valid' in meta:
+ smplx_valid = meta['smplx_valid']
+ else:
+ smplx_valid = np.ones(len(bbox_xywh))
+
+ decompressed = False
+ if content['__keypoints_compressed__']:
+ decompressed_kps = self.decompress_keypoints(content)
+ decompressed = True
+
+ keypoints3d = None
+ valid_kps3d = False
+ keypoints3d_mask = None
+ valid_kps3d_mask = False
+
+ # processing keypoints
+ for kps3d_key in KPS3D_KEYS:
+ if kps3d_key in content:
+ keypoints3d = decompressed_kps[kps3d_key][:, self.SMPLX_137_MAPPING, :] if decompressed \
+ else content[kps3d_key][:, self.SMPLX_137_MAPPING, :]
+ valid_kps3d = True
+ if keypoints3d.shape[-1] == 4:
+ valid_kps3d_mask = True
+ break
+ if self.keypoints2d is not None:
+ keypoints2d = decompressed_kps[self.keypoints2d][:, self.SMPLX_137_MAPPING, :] if decompressed \
+ else content[self.keypoints2d][:, self.SMPLX_137_MAPPING, :]
+
+
+ else:
+ for kps2d_key in KPS2D_KEYS:
+ if kps2d_key in content:
+ keypoints2d = decompressed_kps[kps2d_key][:, self.SMPLX_137_MAPPING, :] if decompressed \
+ else content[kps2d_key][:, self.SMPLX_137_MAPPING, :]
+ break
+ if keypoints2d.shape[-1] == 3:
+ valid_kps3d_mask = True
+
+ print('Done. Time: {:.2f}s'.format(time.time() - tic))
+
+ datalist = []
+ # num_examples
+
+ # processing each image, filter according to bbox valid
+ for i in tqdm.tqdm(range(int(num_examples))):
+
+ if self.data_split == 'train' and i % train_sample_interval != 0:
+ continue
+ frame_start, frame_end = frame_range[i]
+ img_path = osp.join(self.img_dir, image_path[frame_start])
+ # im_shape = cv2.imread(img_path).shape[:2]
+ img_shape = image_shape[
+ frame_start] if image_shape is not None else self.img_shape
+
+
+ bbox_list = bbox_xywh[frame_start:frame_end, :4]
+
+ valid_idx = []
+ body_bbox_list = []
+
+ # if hasattr(cfg, 'bbox_ratio'):
+ # bbox_ratio = cfg.bbox_ratio * 0.833 # preprocess body bbox is giving 1.2 box padding
+ # else:
+ # bbox_ratio = 1.25
+ # if self.__class__.__name__ == 'SPEC':
+ # bbox_ratio = 1.25
+
+ for bbox_i, bbox in enumerate(bbox_list):
+
+ bbox = process_bbox(bbox,
+ img_width=img_shape[1],
+ img_height=img_shape[0],
+ ratio=body_bbox_ratio)
+ if bbox is None:
+ continue
+ else:
+ valid_idx.append(frame_start + bbox_i)
+ bbox[2:] += bbox[:2]
+ body_bbox_list.append(bbox)
+
+ if len(valid_idx) == 0:
+ continue
+ valid_num = len(valid_idx)
+ # hand/face bbox
+ lhand_bbox_list = []
+ rhand_bbox_list = []
+ face_bbox_list = []
+ smplx_valid_list = []
+ for bbox_i in valid_idx:
+ smplx_valid_list.append(smplx_valid[bbox_i])
+ lhand_bbox = lhand_bbox_xywh[bbox_i]
+ rhand_bbox = rhand_bbox_xywh[bbox_i]
+ face_bbox = face_bbox_xywh[bbox_i]
+ if lhand_bbox[-1] > 0: # conf > 0
+ lhand_bbox = lhand_bbox[:4]
+ # if hasattr(cfg, 'bbox_ratio'):
+ lhand_bbox = process_bbox(lhand_bbox,
+ img_width=img_shape[1],
+ img_height=img_shape[0],
+ ratio=hand_bbox_ratio)
+ if lhand_bbox is not None:
+ lhand_bbox[2:] += lhand_bbox[:2] # xywh -> xyxy
+ else:
+ lhand_bbox = None
+ if rhand_bbox[-1] > 0:
+ rhand_bbox = rhand_bbox[:4]
+ # if hasattr(cfg, 'bbox_ratio'):
+ rhand_bbox = process_bbox(rhand_bbox,
+ img_width=img_shape[1],
+ img_height=img_shape[0],
+ ratio=hand_bbox_ratio)
+ if rhand_bbox is not None:
+ rhand_bbox[2:] += rhand_bbox[:2] # xywh -> xyxy
+ else:
+ rhand_bbox = None
+ if face_bbox[-1] > 0:
+ face_bbox = face_bbox[:4]
+ # if hasattr(cfg, 'bbox_ratio'):
+ face_bbox = process_bbox(face_bbox,
+ img_width=img_shape[1],
+ img_height=img_shape[0],
+ ratio=hand_bbox_ratio)
+ if face_bbox is not None:
+ face_bbox[2:] += face_bbox[:2] # xywh -> xyxy
+ else:
+ face_bbox = None
+ lhand_bbox_list.append(lhand_bbox)
+ rhand_bbox_list.append(rhand_bbox)
+ face_bbox_list.append(face_bbox)
+
+ joint_img = keypoints2d[valid_idx]
+
+ if valid_kps3d:
+ joint_cam = keypoints3d[valid_idx]
+ else:
+ joint_cam = None
+ if 'leye_pose_0' in smplx.keys():
+ smplx.pop('leye_pose_0')
+ if 'leye_pose_1' in smplx.keys():
+ smplx.pop('leye_pose_1')
+ if 'leye_pose' in smplx.keys():
+ smplx.pop('leye_pose')
+ if 'reye_pose_0' in smplx.keys():
+ smplx.pop('reye_pose_0')
+ if 'reye_pose_1' in smplx.keys():
+ smplx.pop('reye_pose_1')
+ if 'reye_pose' in smplx.keys():
+ smplx.pop('reye_pose')
+
+
+ smplx_param = {k: v[valid_idx] for k, v in smplx.items()}
+ gender_ = gender[valid_idx] \
+ if gender is not None else np.array(['neutral']*(valid_num))
+ lhand_bbox_valid = lhand_bbox_xywh[valid_idx,4]
+ rhand_bbox_valid = rhand_bbox_xywh[valid_idx,4]
+ face_bbox_valid = face_bbox_xywh[valid_idx,4]
+
+ # TODO: set invalid if None?
+ smplx_param['root_pose'] = smplx_param.pop('global_orient', None)
+ smplx_param['shape'] = smplx_param.pop('betas', None)
+ smplx_param['trans'] = smplx_param.pop('transl', np.zeros([len(valid_idx),3]))
+ smplx_param['lhand_pose'] = smplx_param.pop('left_hand_pose', None)
+ smplx_param['rhand_pose'] = smplx_param.pop(
+ 'right_hand_pose', None)
+ smplx_param['expr'] = smplx_param.pop('expression', None)
+
+ # TODO do not fix betas, give up shape supervision
+ if 'betas_neutral' in smplx_param and self.data_split == 'train':
+ smplx_param['shape'] = smplx_param.pop('betas_neutral')
+ # smplx_param['shape'] = np.zeros(10, dtype=np.float32)
+
+ # # TODO fix shape of poses
+ if self.__class__.__name__ == 'Talkshow':
+ smplx_param['body_pose'] = smplx_param['body_pose'].reshape(
+ -1, 21, 3)
+ smplx_param['lhand_pose'] = smplx_param['lhand_pose'].reshape(
+ -1, 15, 3)
+                smplx_param['rhand_pose'] = smplx_param['rhand_pose'].reshape(
+                    -1, 15, 3)
+ smplx_param['expr'] = smplx_param['expr'][:, :10]
+
+ if self.__class__.__name__ == 'BEDLAM':
+ smplx_param['shape'] = smplx_param['shape'][:, :10]
+ # smplx_param['expr'] = None
+ if self.__class__.__name__ == 'GTA':
+ smplx_param['shape'] = np.zeros(
+ [valid_num, 10],
+ dtype=np.float32)
+ if self.__class__.__name__ == 'COCO_NA':
+ # smplx_param['expr'] = None
+ smplx_param['body_pose'] = smplx_param['body_pose'].reshape(
+ -1, 21, 3)
+ smplx_param['lhand_pose'] = smplx_param['lhand_pose'].reshape(
+ -1, 15, 3)
+ smplx_param['rhand_pose'] = smplx_param['rhand_pose'].reshape(
+ -1, 15, 3)
+ if as_smplx == 'smpl':
+ smplx_param['shape'] = np.zeros(
+ [valid_num, 10],
+ dtype=np.float32) # drop smpl betas for smplx
+ smplx_param['body_pose'] = smplx_param[
+ 'body_pose'].reshape(-1,23,3)[:, :21, :] # use smpl body_pose on smplx
+ if as_smplx == 'smplh':
+ smplx_param['shape'] = np.zeros(
+ [valid_num, 10],
+ dtype=np.float32) # drop smpl betas for smplx
+
+ if smplx_param['lhand_pose'] is None or self.body_only == True:
+ smplx_param['lhand_valid'] = np.zeros(valid_num, dtype=np.bool8)
+ else:
+ smplx_param['lhand_valid'] = lhand_bbox_valid.astype(np.bool8)
+
+ if smplx_param['rhand_pose'] is None or self.body_only == True:
+ smplx_param['rhand_valid'] = np.zeros(valid_num, dtype=np.bool8)
+ else:
+ smplx_param['rhand_valid'] = rhand_bbox_valid.astype(np.bool8)
+
+ if smplx_param['expr'] is None or self.body_only == True:
+ smplx_param['face_valid'] = np.zeros(valid_num, dtype=np.bool8)
+ else:
+ smplx_param['face_valid'] = face_bbox_valid.astype(np.bool8)
+
+ smplx_param['smplx_valid'] = np.array(smplx_valid_list).astype(np.bool8)
+ if joint_cam is not None and np.any(np.isnan(joint_cam)):
+ continue
+
+
+ if self.__class__.__name__ == 'SPEC':
+ joint_img[:,:,2] = joint_img[:,:,2]>0
+ joint_cam[:,:,3] = joint_cam[:,:,0]!=0
+ datalist.append({
+ 'img_path': img_path,
+ 'img_shape': img_shape,
+ 'bbox': body_bbox_list,
+ 'lhand_bbox': lhand_bbox_list,
+ 'rhand_bbox': rhand_bbox_list,
+ 'face_bbox': face_bbox_list,
+ 'joint_img': joint_img,
+ 'joint_cam': joint_cam,
+ 'smplx_param': smplx_param,
+ 'as_smplx': as_smplx,
+ 'gender': gender_
+ })
+
+ # save memory
+ del content, image_path, bbox_xywh, lhand_bbox_xywh, rhand_bbox_xywh, face_bbox_xywh, keypoints3d, keypoints2d
+
+ if self.data_split == 'train':
+ print(f'[{self.__class__.__name__} train] original size:',
+ int(num_examples), '. Sample interval:',
+ train_sample_interval, '. Sampled size:', len(datalist))
+
+ if getattr(cfg, 'data_strategy',
+ None) == 'balance' and self.data_split == 'train':
+ print(
+ f'[{self.__class__.__name__}] Using [balance] strategy with datalist shuffled...'
+ )
+ random.shuffle(datalist)
+
+ return datalist
+
+ def __len__(self):
+ return len(self.datalist)
+
+ def __getitem__(self, idx):
+ # rank = self.rank
+ # local_rank = rank % torch.cuda.device_count()
+ # with open(f'index_log_{rank}.txt', 'a') as f:
+ # f.write(f'{rank}-{local_rank}-{idx}\n')
+ try:
+ data = copy.deepcopy(self.datalist[idx])
+ except Exception as e:
+ print(f'[{self.__class__.__name__}] Error loading data {idx}')
+ print(e)
+ exit(0)
+ img_path, img_shape, bbox = \
+ data['img_path'], data['img_shape'], data['bbox']
+ as_smplx = data['as_smplx']
+ gender = data['gender'].copy()
+ for gender_str, gender_num in {
+ 'neutral': -1, 'male': 0, 'female': 1}.items():
+ gender[gender==gender_str]=gender_num
+ gender = gender.astype(int)
+
+ img_whole_bbox = np.array([0, 0, img_shape[1], img_shape[0]])
+ img = load_img(img_path, order='BGR')
+
+ num_person = len(data['bbox'])
+ data_name = self.__class__.__name__
+ try:
+ # dist.barrier()
+ img, img2bb_trans, bb2img_trans, rot, do_flip = \
+ augmentation_instance_sample(img, img_whole_bbox, self.data_split, data, data_name)
+ except Exception as e:
+            rank = getattr(self, 'rank', 0)  # self.rank is unset unless dist.get_rank() is enabled in __init__
+ local_rank = rank % torch.cuda.device_count()
+ with open(f'index_log_{rank}.txt', 'a') as f:
+ f.write(f'{rank}-{local_rank}-{idx}\n')
+ f.write(f'[{self.__class__.__name__}] Error loading data {idx}\n')
+ f.write(f'Error in augmentation_instance_sample for {img_path}\n')
+ # print(f'[{self.__class__.__name__}] Error loading data {idx}')
+ # print(f'Error in augmentation_instance_sample for {img_path}')
+ raise e
+ cropped_img_shape = img.shape[:2]
+
+ if self.data_split == 'train':
+ joint_cam = data['joint_cam'] # num, 137,4
+ if joint_cam is not None:
+ dummy_cord = False
+ joint_cam[:,:,:3] = \
+ joint_cam[:,:,:3] - joint_cam[:, self.joint_set['root_joint_idx'], None, :3] # root-relative
+ else:
+ # dummy cord as joint_cam
+ dummy_cord = True
+ joint_cam = np.zeros(
+ (num_person, self.joint_set['joint_num'], 4),
+ dtype=np.float32)
+
+ joint_img = data['joint_img']
+ # do rotation on keypoints
+ joint_img_aug, joint_cam_wo_ra, joint_cam_ra, joint_trunc = \
+ process_db_coord_batch_no_valid(
+ joint_img, joint_cam, do_flip, img_shape,
+ self.joint_set['flip_pairs'], img2bb_trans, rot,
+ self.joint_set['joints_name'], smpl_x.joints_name,
+ cropped_img_shape)
+ joint_img_aug[:,:,2:] = joint_img_aug[:,:,2:] * joint_trunc
+
+ # smplx coordinates and parameters
+ smplx_param = data['smplx_param']
+
+
+ if self.__class__.__name__ in [ 'CHI3D', 'SynBody', 'UBody_MM']:
+ smplx_param['lhand_pose']-=self.lhand_mean[None]
+ smplx_param['rhand_pose']-=self.rhand_mean[None]
+ # smplx_param
+ smplx_pose, smplx_shape, smplx_expr, smplx_pose_valid, \
+ smplx_joint_valid, smplx_expr_valid, smplx_shape_valid = \
+ process_human_model_output_batch_simplify(
+ smplx_param, do_flip, rot, as_smplx, data_name)
+ smplx_joint_valid = smplx_joint_valid[:, :, None]
+ # if cam not provided, we take joint_img as smplx joint 2d,
+ # which is commonly the case for our processed humandata
+ # change smplx_shape if use_betas_neutral
+ # processing follows that in process_human_model_output
+ if self.use_betas_neutral:
+ smplx_shape = smplx_param['betas_neutral'].reshape(
+ num_person, -1)
+ smplx_shape[(np.abs(smplx_shape) > 3).any(axis=1)] = 0.
+ smplx_shape = smplx_shape.reshape(num_person, -1)
+
+ if self.__class__.__name__ == 'MPII_MM' :
+ for name in ('L_Ankle', 'R_Ankle', 'L_Wrist', 'R_Wrist'):
+ smplx_pose_valid[:, smpl_x.orig_joints_name.index(name)] = 0
+ for name in ('L_Big_toe', 'L_Small_toe', 'L_Heel', 'R_Big_toe', 'R_Small_toe', 'R_Heel'):
+ smplx_joint_valid[:,smpl_x.joints_name.index(name)] = 0
+
+
+ lhand_bbox_center_list = []
+ lhand_bbox_valid_list = []
+ lhand_bbox_size_list = []
+ lhand_bbox_list = []
+ face_bbox_center_list = []
+ face_bbox_size_list = []
+ face_bbox_valid_list = []
+ face_bbox_list = []
+ rhand_bbox_center_list = []
+ rhand_bbox_valid_list = []
+ rhand_bbox_size_list = []
+ rhand_bbox_list = []
+ body_bbox_center_list = []
+ body_bbox_size_list = []
+ body_bbox_valid_list = []
+ body_bbox_list = []
+ # hand and face bbox transform
+
+
+ for i in range(num_person):
+ body_bbox, body_bbox_valid = self.process_hand_face_bbox(
+ data['bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+
+ lhand_bbox, lhand_bbox_valid = self.process_hand_face_bbox(
+ data['lhand_bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+ lhand_bbox_valid *= smplx_param['lhand_valid'][i]
+
+ rhand_bbox, rhand_bbox_valid = self.process_hand_face_bbox(
+ data['rhand_bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+ rhand_bbox_valid *= smplx_param['rhand_valid'][i]
+
+ face_bbox, face_bbox_valid = self.process_hand_face_bbox(
+ data['face_bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+ face_bbox_valid *= smplx_param['face_valid'][i]
+
+ # BEDLAM and COCO_NA do not have face expression
+ # if self.__class__.__name__ != 'BEDLAM':
+ # face_bbox_valid *= smplx_param['face_valid'][i]
+
+ if do_flip:
+ lhand_bbox, rhand_bbox = rhand_bbox, lhand_bbox
+ lhand_bbox_valid, rhand_bbox_valid = rhand_bbox_valid, lhand_bbox_valid
+
+ body_bbox_list.append(body_bbox)
+ lhand_bbox_list.append(lhand_bbox)
+ rhand_bbox_list.append(rhand_bbox)
+ face_bbox_list.append(face_bbox)
+
+ lhand_bbox_center = (lhand_bbox[0] + lhand_bbox[1]) / 2.
+ rhand_bbox_center = (rhand_bbox[0] + rhand_bbox[1]) / 2.
+ face_bbox_center = (face_bbox[0] + face_bbox[1]) / 2.
+ body_bbox_center = (body_bbox[0] + body_bbox[1]) / 2.
+ lhand_bbox_size = lhand_bbox[1] - lhand_bbox[0]
+ rhand_bbox_size = rhand_bbox[1] - rhand_bbox[0]
+
+ face_bbox_size = face_bbox[1] - face_bbox[0]
+ body_bbox_size = body_bbox[1] - body_bbox[0]
+ lhand_bbox_center_list.append(lhand_bbox_center)
+ lhand_bbox_valid_list.append(lhand_bbox_valid)
+ lhand_bbox_size_list.append(lhand_bbox_size)
+ face_bbox_center_list.append(face_bbox_center)
+ face_bbox_size_list.append(face_bbox_size)
+ face_bbox_valid_list.append(face_bbox_valid)
+ rhand_bbox_center_list.append(rhand_bbox_center)
+ rhand_bbox_valid_list.append(rhand_bbox_valid)
+ rhand_bbox_size_list.append(rhand_bbox_size)
+ body_bbox_center_list.append(body_bbox_center)
+ body_bbox_size_list.append(body_bbox_size)
+ body_bbox_valid_list.append(body_bbox_valid)
+
+
+ body_bbox = np.stack(body_bbox_list, axis=0)
+ lhand_bbox = np.stack(lhand_bbox_list, axis=0)
+ rhand_bbox = np.stack(rhand_bbox_list, axis=0)
+ face_bbox = np.stack(face_bbox_list, axis=0)
+ lhand_bbox_center = np.stack(lhand_bbox_center_list, axis=0)
+ lhand_bbox_valid = np.stack(lhand_bbox_valid_list, axis=0)
+ lhand_bbox_size = np.stack(lhand_bbox_size_list, axis=0)
+ face_bbox_center = np.stack(face_bbox_center_list, axis=0)
+ face_bbox_size = np.stack(face_bbox_size_list, axis=0)
+ face_bbox_valid = np.stack(face_bbox_valid_list, axis=0)
+ body_bbox_center = np.stack(body_bbox_center_list, axis=0)
+ body_bbox_size = np.stack(body_bbox_size_list, axis=0)
+ body_bbox_valid = np.stack(body_bbox_valid_list, axis=0)
+ rhand_bbox_center = np.stack(rhand_bbox_center_list, axis=0)
+ rhand_bbox_valid = np.stack(rhand_bbox_valid_list, axis=0)
+ rhand_bbox_size = np.stack(rhand_bbox_size_list, axis=0)
+
+ inputs = {'img': img}
+
+ # joint_img_aug[:,:,2] = joint_img_aug[:,:,2] * body_bbox_valid[:,None]
+
+ is_3D = float(False) if dummy_cord else float(True)
+ if self.__class__.__name__ == 'COCO_NA':
+ is_3D = False
+ if self.__class__.__name__ == 'GTA_Human2':
+ smplx_shape_valid = smplx_shape_valid * 0
+ if self.__class__.__name__ == 'PoseTrack' or self.__class__.__name__ == 'MPII_MM' \
+ or self.__class__.__name__ == 'CrowdPose' or self.__class__.__name__ == 'UBody_MM' \
+ or self.__class__.__name__ == 'COCO_NA':
+ joint_cam_ra[...,-1] = joint_cam_ra[...,-1] * smplx_joint_valid[...,0]
+ joint_cam_wo_ra[...,-1] = joint_cam_wo_ra[...,-1] * smplx_joint_valid[...,0]
+ joint_img_aug[...,-1] = joint_img_aug[...,-1] * smplx_joint_valid[...,0]
+ # if body_bbox_valid.sum() > 0:
+
+
+ targets = {
+ # keypoints2d, [0,img_w],[0,img_h] -> [0,1] -> [0,output_hm_shape]
+ 'joint_img': joint_img_aug[body_bbox_valid>0],
+ # joint_cam, kp3d wo ra # raw kps3d probably without ra
+ 'joint_cam': joint_cam_wo_ra[body_bbox_valid>0],
+ # kps3d with body, face, hand ra
+ 'smplx_joint_cam': joint_cam_ra[body_bbox_valid>0],
+ 'smplx_pose': smplx_pose[body_bbox_valid>0],
+ 'smplx_shape': smplx_shape[body_bbox_valid>0],
+ 'smplx_expr': smplx_expr[body_bbox_valid>0],
+ 'lhand_bbox_center': lhand_bbox_center[body_bbox_valid>0],
+ 'lhand_bbox_size': lhand_bbox_size[body_bbox_valid>0],
+ 'rhand_bbox_center': rhand_bbox_center[body_bbox_valid>0],
+ 'rhand_bbox_size': rhand_bbox_size[body_bbox_valid>0],
+ 'face_bbox_center': face_bbox_center[body_bbox_valid>0],
+ 'face_bbox_size': face_bbox_size[body_bbox_valid>0],
+ 'body_bbox_center': body_bbox_center[body_bbox_valid>0],
+ 'body_bbox_size': body_bbox_size[body_bbox_valid>0],
+ 'body_bbox': body_bbox.reshape(-1,4)[body_bbox_valid>0],
+ 'lhand_bbox': lhand_bbox.reshape(-1,4)[body_bbox_valid>0],
+ 'rhand_bbox': rhand_bbox.reshape(-1,4)[body_bbox_valid>0],
+ 'face_bbox': face_bbox.reshape(-1,4)[body_bbox_valid>0],
+ 'gender': gender[body_bbox_valid>0]}
+
+ meta_info = {
+ 'joint_trunc': joint_trunc[body_bbox_valid>0],
+ 'smplx_pose_valid': smplx_pose_valid[body_bbox_valid>0],
+ 'smplx_shape_valid': smplx_shape_valid[body_bbox_valid>0],
+ 'smplx_expr_valid': smplx_expr_valid[body_bbox_valid>0],
+ 'is_3D': is_3D,
+ 'lhand_bbox_valid': lhand_bbox_valid[body_bbox_valid>0],
+ 'rhand_bbox_valid': rhand_bbox_valid[body_bbox_valid>0],
+ 'face_bbox_valid': face_bbox_valid[body_bbox_valid>0],
+ 'body_bbox_valid': body_bbox_valid[body_bbox_valid>0],
+ 'img_shape': np.array(img.shape[:2]),
+ 'ori_shape':data['img_shape'],
+ 'idx': idx
+
+ }
+
+ result = {**inputs, **targets, **meta_info}
+
+ result = self.normalize(result)
+ result = self.format(result)
+ return result
+
+
+
+ if self.data_split == 'test':
+ self.cam_param = {}
+ joint_cam = data['joint_cam']
+
+ if joint_cam is not None:
+ dummy_cord = False
+ joint_cam[:,:,:3] = joint_cam[:,:,:3] - joint_cam[
+ :, self.joint_set['root_joint_idx'], None, :3] # root-relative
+ else:
+ # dummy cord as joint_cam
+ dummy_cord = True
+ joint_cam = np.zeros(
+ (num_person, self.joint_set['joint_num'], 3),
+ dtype=np.float32)
+
+ joint_img = data['joint_img']
+
+
+ joint_img_aug, joint_cam_wo_ra, joint_cam_ra, joint_trunc = \
+ process_db_coord_batch_no_valid(
+ joint_img, joint_cam, do_flip, img_shape,
+ self.joint_set['flip_pairs'], img2bb_trans, rot,
+ self.joint_set['joints_name'], smpl_x.joints_name,
+ cropped_img_shape)
+
+
+
+ # smplx coordinates and parameters
+ smplx_param = data['smplx_param']
+ # smplx_cam_trans = np.array(
+ # smplx_param['trans']) if 'trans' in smplx_param else None
+            # TODO: remove this, separate smpl and smplx
+ smplx_pose, smplx_shape, smplx_expr, smplx_pose_valid, \
+ smplx_joint_valid, smplx_expr_valid, smplx_shape_valid = \
+ process_human_model_output_batch_simplify(
+ smplx_param, do_flip, rot, as_smplx)
+ # if cam not provided, we take joint_img as smplx joint 2d,
+ # which is commonly the case for our processed humandata
+ if self.use_betas_neutral:
+ smplx_shape = smplx_param['betas_neutral'].reshape(
+ num_person, -1)
+ smplx_shape[(np.abs(smplx_shape) > 3).any(axis=1)] = 0.
+ smplx_shape = smplx_shape.reshape(num_person, -1)
+ # smplx_pose_valid = np.tile(smplx_pose_valid[:,:, None], (1, 3)).reshape(num_person,-1)
+ smplx_joint_valid = smplx_joint_valid[:, :, None]
+
+ # if not (smplx_shape == 0).all():
+ # smplx_shape_valid = True
+ # else:
+ # smplx_shape_valid = False
+ lhand_bbox_center_list = []
+ lhand_bbox_valid_list = []
+ lhand_bbox_size_list = []
+ lhand_bbox_list = []
+ face_bbox_center_list = []
+ face_bbox_size_list = []
+ face_bbox_valid_list = []
+ face_bbox_list = []
+ rhand_bbox_center_list = []
+ rhand_bbox_valid_list = []
+ rhand_bbox_size_list = []
+ rhand_bbox_list = []
+ body_bbox_center_list = []
+ body_bbox_size_list = []
+ body_bbox_valid_list = []
+ body_bbox_list = []
+
+ for i in range(num_person):
+ lhand_bbox, lhand_bbox_valid = self.process_hand_face_bbox(
+ data['lhand_bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+ rhand_bbox, rhand_bbox_valid = self.process_hand_face_bbox(
+ data['rhand_bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+ face_bbox, face_bbox_valid = self.process_hand_face_bbox(
+ data['face_bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+
+ body_bbox, body_bbox_valid = self.process_hand_face_bbox(
+ data['bbox'][i], do_flip, img_shape, img2bb_trans,
+ cropped_img_shape)
+
+ if do_flip:
+ lhand_bbox, rhand_bbox = rhand_bbox, lhand_bbox
+ lhand_bbox_valid, rhand_bbox_valid = rhand_bbox_valid, lhand_bbox_valid
+
+ body_bbox_list.append(body_bbox)
+ lhand_bbox_list.append(lhand_bbox)
+ rhand_bbox_list.append(rhand_bbox)
+ face_bbox_list.append(face_bbox)
+
+ lhand_bbox_center = (lhand_bbox[0] + lhand_bbox[1]) / 2.
+ rhand_bbox_center = (rhand_bbox[0] + rhand_bbox[1]) / 2.
+ face_bbox_center = (face_bbox[0] + face_bbox[1]) / 2.
+ body_bbox_center = (body_bbox[0] + body_bbox[1]) / 2.
+ lhand_bbox_size = lhand_bbox[1] - lhand_bbox[0]
+ rhand_bbox_size = rhand_bbox[1] - rhand_bbox[0]
+
+ face_bbox_size = face_bbox[1] - face_bbox[0]
+ body_bbox_size = body_bbox[1] - body_bbox[0]
+ lhand_bbox_center_list.append(lhand_bbox_center)
+ lhand_bbox_valid_list.append(lhand_bbox_valid)
+ lhand_bbox_size_list.append(lhand_bbox_size)
+ face_bbox_center_list.append(face_bbox_center)
+ face_bbox_size_list.append(face_bbox_size)
+ face_bbox_valid_list.append(face_bbox_valid)
+ rhand_bbox_center_list.append(rhand_bbox_center)
+ rhand_bbox_valid_list.append(rhand_bbox_valid)
+ rhand_bbox_size_list.append(rhand_bbox_size)
+ body_bbox_center_list.append(body_bbox_center)
+ body_bbox_size_list.append(body_bbox_size)
+ body_bbox_valid_list.append(body_bbox_valid)
+
+ body_bbox = np.stack(body_bbox_list, axis=0)
+ lhand_bbox = np.stack(lhand_bbox_list, axis=0)
+ rhand_bbox = np.stack(rhand_bbox_list, axis=0)
+ face_bbox = np.stack(face_bbox_list, axis=0)
+ lhand_bbox_center = np.stack(lhand_bbox_center_list, axis=0)
+ lhand_bbox_valid = np.stack(lhand_bbox_valid_list, axis=0)
+ lhand_bbox_size = np.stack(lhand_bbox_size_list, axis=0)
+ face_bbox_center = np.stack(face_bbox_center_list, axis=0)
+ face_bbox_size = np.stack(face_bbox_size_list, axis=0)
+ face_bbox_valid = np.stack(face_bbox_valid_list, axis=0)
+ body_bbox_center = np.stack(body_bbox_center_list, axis=0)
+ body_bbox_size = np.stack(body_bbox_size_list, axis=0)
+ body_bbox_valid = np.stack(body_bbox_valid_list, axis=0)
+ rhand_bbox_center = np.stack(rhand_bbox_center_list, axis=0)
+ rhand_bbox_valid = np.stack(rhand_bbox_valid_list, axis=0)
+ rhand_bbox_size = np.stack(rhand_bbox_size_list, axis=0)
+
+
+ inputs = {'img': img}
+
+ targets = {
+ # keypoints2d, [0,img_w],[0,img_h] -> [0,1] -> [0,output_hm_shape]
+ 'joint_img': joint_img_aug,
+ # projected smplx if valid cam_param, else same as keypoints2d
+ # joint_cam, kp3d wo ra # raw kps3d probably without ra
+ 'joint_cam': joint_cam_wo_ra,
+ 'ann_idx': idx,
+ # kps3d with body, face, hand ra
+ 'smplx_joint_cam': joint_cam_ra,
+ 'smplx_pose': smplx_pose,
+ 'smplx_shape': smplx_shape,
+ 'smplx_expr': smplx_expr,
+ 'lhand_bbox_center': lhand_bbox_center,
+ 'lhand_bbox_size': lhand_bbox_size,
+ 'rhand_bbox_center': rhand_bbox_center,
+ 'rhand_bbox_size': rhand_bbox_size,
+ 'face_bbox_center': face_bbox_center,
+ 'face_bbox_size': face_bbox_size,
+ 'body_bbox_center': body_bbox_center,
+ 'body_bbox_size': body_bbox_size,
+ 'body_bbox': body_bbox.reshape(-1,4),
+ 'lhand_bbox': lhand_bbox.reshape(-1,4),
+ 'rhand_bbox': rhand_bbox.reshape(-1,4),
+ 'face_bbox': face_bbox.reshape(-1,4),
+ 'gender': gender,
+ 'bb2img_trans': bb2img_trans,
+ }
+
+ if self.body_only:
+ meta_info = {
+ 'joint_trunc': joint_trunc,
+ 'smplx_pose_valid': smplx_pose_valid,
+ 'smplx_shape_valid': float(smplx_shape_valid),
+ 'smplx_expr_valid': smplx_expr_valid,
+ 'is_3D': float(False) if dummy_cord else float(True),
+ 'lhand_bbox_valid': lhand_bbox_valid,
+ 'rhand_bbox_valid': rhand_bbox_valid,
+ 'face_bbox_valid': face_bbox_valid,
+ 'body_bbox_valid': body_bbox_valid,
+ 'img_shape': np.array(img.shape[:2]),
+ 'ori_shape':data['img_shape'],
+ 'idx': idx
+
+ }
+ else:
+ meta_info = {
+ 'joint_trunc': joint_trunc,
+ 'smplx_pose_valid': smplx_pose_valid,
+ 'smplx_shape_valid': smplx_shape_valid,
+ 'smplx_expr_valid': smplx_expr_valid,
+ 'is_3D': float(False) if dummy_cord else float(True),
+ 'lhand_bbox_valid': lhand_bbox_valid,
+ 'rhand_bbox_valid': rhand_bbox_valid,
+ 'face_bbox_valid': face_bbox_valid,
+ 'body_bbox_valid': body_bbox_valid,
+ 'img_shape': np.array(img.shape[:2]),
+ 'ori_shape':data['img_shape'],
+ 'idx': idx
+ }
+
+ result = {**inputs, **targets, **meta_info}
+ result = self.normalize(result)
+ result = self.format(result)
+ return result
+
+ def process_hand_face_bbox(self, bbox, do_flip, img_shape, img2bb_trans,
+ input_img_shape):
+ if bbox is None:
+ bbox = np.array([0, 0, 1, 1],
+ dtype=np.float32).reshape(2, 2) # dummy value
+ bbox_valid = float(False) # dummy value
+ else:
+ # reshape to top-left (x,y) and bottom-right (x,y)
+ bbox = bbox.reshape(2, 2)
+
+ # flip augmentation
+ if do_flip:
+ bbox[:, 0] = img_shape[1] - bbox[:, 0] - 1
+ bbox[0, 0], bbox[1, 0] = bbox[1, 0].copy(), bbox[
+ 0, 0].copy() # xmin <-> xmax swap
+
+ # make four points of the bbox
+ bbox = bbox.reshape(4).tolist()
+ xmin, ymin, xmax, ymax = bbox
+ bbox = np.array(
+ [[xmin, ymin], [xmax, ymin], [xmax, ymax], [xmin, ymax]],
+ dtype=np.float32).reshape(4, 2)
+
+ # affine transformation (crop, rotation, scale)
+ bbox_xy1 = np.concatenate((bbox, np.ones_like(bbox[:, :1])), 1)
+ bbox = np.dot(img2bb_trans,
+ bbox_xy1.transpose(1, 0)).transpose(1, 0)[:, :2]
+
+ # print(bbox)
+ # bbox[:, 0] = bbox[:, 0] / input_img_shape[1] * cfg.output_hm_shape[2]
+ # bbox[:, 1] = bbox[:, 1] / input_img_shape[0] * cfg.output_hm_shape[1]
+
+ bbox[:, 0] /= input_img_shape[1]
+ bbox[:, 1] /= input_img_shape[0]
+
+ # make box a rectangle without rotation
+ if np.max(bbox[:,0])<=0 or np.min(bbox[:,0])>=1 or np.max(bbox[:,1])<=0 or np.min(bbox[:,1])>=1:
+ bbox_valid = float(False)
+ bbox = np.array([0, 0, 1, 1], dtype=np.float32)
+ else:
+ xmin = np.max([np.min(bbox[:, 0]), 0])
+ xmax = np.min([np.max(bbox[:, 0]), 1])
+ ymin = np.max([np.min(bbox[:, 1]), 0])
+ ymax = np.min([np.max(bbox[:, 1]), 1])
+ bbox = np.array([xmin, ymin, xmax, ymax], dtype=np.float32)
+
+ bbox = np.clip(bbox,0,1)
+ bbox_valid = float(True)
+ bbox = bbox.reshape(2, 2)
+
+ return bbox, bbox_valid
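+
+    # Returned values: `bbox` is a (2, 2) array of normalized corners
+    # [[xmin, ymin], [xmax, ymax]] relative to the cropped/augmented image
+    # (clipped to [0, 1]); `bbox_valid` is 1.0 when the annotation exists and
+    # still overlaps the crop after flipping and the affine transform, else 0.0.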
+
+ def evaluate(self, outs, cur_sample_idx=None):
+ annots = self.datalist
+ sample_num = len(outs)
+ eval_result = {
+ 'pa_mpvpe_all': [],
+ 'pa_mpvpe_l_hand': [],
+ 'pa_mpvpe_r_hand': [],
+ 'pa_mpvpe_hand': [],
+ 'pa_mpvpe_face': [],
+ 'mpvpe_all': [],
+ 'mpvpe_l_hand': [],
+ 'mpvpe_r_hand': [],
+ 'mpvpe_hand': [],
+ 'mpvpe_face': [],
+ 'pa_mpjpe_body': [],
+ 'pa_mpjpe_l_hand': [],
+ 'pa_mpjpe_r_hand': [],
+ 'pa_mpjpe_hand': []
+ }
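+        # 'pa_*' entries are computed after rigid_align (Procrustes alignment),
+        # while the plain 'mpvpe_*' entries only translate the prediction so
+        # that the pelvis / wrist / neck matches the ground truth.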
+
+ for n in range(sample_num):
+ out = outs[n]
+ ann_idx = out['gt_ann_idx']
+ mesh_gt = out['smplx_mesh_cam_pseudo_gt']
+ mesh_out = out['smplx_mesh_cam']
+ cam_trans = out['cam_trans']
+ ann_idx = out['gt_ann_idx']
+ img_path = []
+ for ann_id in ann_idx:
+ img_path.append(annots[ann_id]['img_path'])
+ eval_result['img_path'] = img_path
+ eval_result['ann_idx'] = ann_idx
+
+ img = out['img']
+ # MPVPE from all vertices
+ mesh_out_align = mesh_out - np.dot(
+ smpl_x.J_regressor,
+ mesh_out)[smpl_x.J_regressor_idx['pelvis'], None, :] + np.dot(
+ smpl_x.J_regressor,
+ mesh_gt)[smpl_x.J_regressor_idx['pelvis'], None, :]
+ eval_result['mpvpe_all'].append(
+ np.sqrt(np.sum(
+ (mesh_out_align - mesh_gt)**2, 1)).mean() * 1000)
+ mesh_out_align = rigid_align(mesh_out, mesh_gt)
+ eval_result['pa_mpvpe_all'].append(
+ np.sqrt(np.sum(
+ (mesh_out_align - mesh_gt)**2, 1)).mean() * 1000)
+ # MPVPE from hand vertices
+ mesh_gt_lhand = mesh_gt[smpl_x.hand_vertex_idx['left_hand'], :]
+ mesh_out_lhand = mesh_out[smpl_x.hand_vertex_idx['left_hand'], :]
+ mesh_gt_rhand = mesh_gt[smpl_x.hand_vertex_idx['right_hand'], :]
+ mesh_out_rhand = mesh_out[smpl_x.hand_vertex_idx['right_hand'], :]
+ mesh_out_lhand_align = mesh_out_lhand - np.dot(
+ smpl_x.J_regressor,
+ mesh_out)[smpl_x.J_regressor_idx['lwrist'], None, :] + np.dot(
+ smpl_x.J_regressor,
+ mesh_gt)[smpl_x.J_regressor_idx['lwrist'], None, :]
+ mesh_out_rhand_align = mesh_out_rhand - np.dot(
+ smpl_x.J_regressor,
+ mesh_out)[smpl_x.J_regressor_idx['rwrist'], None, :] + np.dot(
+ smpl_x.J_regressor,
+ mesh_gt)[smpl_x.J_regressor_idx['rwrist'], None, :]
+ eval_result['mpvpe_l_hand'].append(
+ np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, 1)).mean() *
+ 1000)
+ eval_result['mpvpe_r_hand'].append(
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, 1)).mean() *
+ 1000)
+ eval_result['mpvpe_hand'].append(
+ (np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, 1)).mean() *
+ 1000 +
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, 1)).mean() *
+ 1000) / 2.)
+ mesh_out_lhand_align = rigid_align(mesh_out_lhand, mesh_gt_lhand)
+ mesh_out_rhand_align = rigid_align(mesh_out_rhand, mesh_gt_rhand)
+ eval_result['pa_mpvpe_l_hand'].append(
+ np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, 1)).mean() *
+ 1000)
+ eval_result['pa_mpvpe_r_hand'].append(
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, 1)).mean() *
+ 1000)
+ eval_result['pa_mpvpe_hand'].append(
+ (np.sqrt(np.sum(
+ (mesh_out_lhand_align - mesh_gt_lhand)**2, 1)).mean() *
+ 1000 +
+ np.sqrt(np.sum(
+ (mesh_out_rhand_align - mesh_gt_rhand)**2, 1)).mean() *
+ 1000) / 2.)
+
+ if self.__class__.__name__ == 'UBody':
+ joint_gt_body_wo_trans = np.dot(smpl_x.j14_regressor,
+ mesh_gt)
+ img_wh = out['gt_img_shape'].flip(-1)
+ joint_gt_body_proj = project_points_new(
+ points_3d=joint_gt_body_wo_trans,
+ pred_cam=cam_trans,
+ focal_length=5000,
+ camera_center=img_wh/2
+ ) # origin image space
+ joint_gt_lhand_wo_trans = np.dot(
+ smpl_x.orig_hand_regressor['left'], mesh_gt)
+ joint_gt_lhand_proj = project_points_new(
+ points_3d=joint_gt_lhand_wo_trans,
+ pred_cam=cam_trans,
+ focal_length=5000,
+ camera_center=img_wh/2
+ ) # origin image space
+ joint_gt_rhand_wo_trans = np.dot(
+                    smpl_x.orig_hand_regressor['right'], mesh_gt)
+ joint_gt_rhand_proj = project_points_new(
+ points_3d=joint_gt_rhand_wo_trans,
+ pred_cam=cam_trans,
+ focal_length=5000,
+ camera_center=img_wh/2
+ ) # origin image space
+ mesh_gt_proj = project_points_new(
+ points_3d=mesh_gt,
+ pred_cam=cam_trans,
+ focal_length=5000,
+ camera_center=img_wh/2)
+ joint_gt_body_valid = self.validate_within_img(
+ img, joint_gt_body_proj)
+ joint_gt_lhand_valid = self.validate_within_img(
+ img, joint_gt_lhand_proj)
+ joint_gt_rhand_valid = self.validate_within_img(
+ img, joint_gt_rhand_proj)
+ mesh_valid = self.validate_within_img(img, mesh_gt_proj)
+ mesh_lhand_valid = mesh_valid[smpl_x.hand_vertex_idx['left_hand']]
+ mesh_rhand_valid = mesh_valid[smpl_x.hand_vertex_idx['right_hand']]
+ mesh_face_valid = mesh_valid[smpl_x.face_vertex_idx]
+
+ # MPVPE from face vertices
+ mesh_gt_face = mesh_gt[smpl_x.face_vertex_idx, :]
+ mesh_out_face = mesh_out[smpl_x.face_vertex_idx, :]
+ mesh_out_face_align = mesh_out_face - np.dot(
+ smpl_x.J_regressor,
+ mesh_out)[smpl_x.J_regressor_idx['neck'], None, :] + np.dot(
+ smpl_x.J_regressor,
+ mesh_gt)[smpl_x.J_regressor_idx['neck'], None, :]
+ eval_result['mpvpe_face'].append(
+ np.sqrt(np.sum(
+ (mesh_out_face_align - mesh_gt_face)**2, 1)).mean() * 1000)
+ mesh_out_face_align = rigid_align(mesh_out_face, mesh_gt_face)
+ eval_result['pa_mpvpe_face'].append(
+ np.sqrt(np.sum(
+ (mesh_out_face_align - mesh_gt_face)**2, 1)).mean() * 1000)
+
+ # MPJPE from body joints
+ joint_gt_body = np.dot(smpl_x.j14_regressor, mesh_gt)
+ joint_out_body = np.dot(smpl_x.j14_regressor, mesh_out)
+ joint_out_body_align = rigid_align(joint_out_body, joint_gt_body)
+ eval_result['pa_mpjpe_body'].append(
+ np.sqrt(np.sum((joint_out_body_align - joint_gt_body)**2,
+ 1))[joint_gt_body_valid].mean() * 1000)
+
+ # eval_result['pa_mpjpe_body'].append(
+ # np.sqrt(np.sum(
+ # (joint_out_body_align - joint_gt_body)**2, 1)).mean() *
+ # 1000)
+
+ # MPJPE from hand joints
+ joint_gt_lhand = np.dot(smpl_x.orig_hand_regressor['left'],
+ mesh_gt)
+ joint_out_lhand = np.dot(smpl_x.orig_hand_regressor['left'],
+ mesh_out)
+ joint_out_lhand_align = rigid_align(joint_out_lhand,
+ joint_gt_lhand)
+ joint_gt_rhand = np.dot(smpl_x.orig_hand_regressor['right'],
+ mesh_gt)
+ joint_out_rhand = np.dot(smpl_x.orig_hand_regressor['right'],
+ mesh_out)
+ joint_out_rhand_align = rigid_align(joint_out_rhand,
+ joint_gt_rhand)
+            # if self.__class__.__name__ == 'UBody':
+            pa_mpjpe_hand = []
+ if sum(joint_gt_lhand_valid) != 0:
+ pa_mpjpe_lhand = np.sqrt(
+ np.sum((joint_out_lhand_align - joint_gt_lhand)**2,
+ 1))[joint_gt_lhand_valid].mean() * 1000
+ pa_mpjpe_hand.append(pa_mpjpe_lhand)
+ eval_result['pa_mpjpe_l_hand'].append(pa_mpjpe_lhand)
+ if sum(joint_gt_rhand_valid) != 0:
+ pa_mpjpe_rhand = np.sqrt(
+ np.sum((joint_out_rhand_align - joint_gt_rhand)**2,
+ 1))[joint_gt_rhand_valid].mean() * 1000
+ pa_mpjpe_hand.append(pa_mpjpe_rhand)
+ eval_result['pa_mpjpe_r_hand'].append(pa_mpjpe_rhand)
+ if len(pa_mpjpe_hand) > 0:
+ eval_result['pa_mpjpe_hand'].append(np.mean(pa_mpjpe_hand))
+
+ eval_result['pa_mpjpe_l_hand'].append(
+ np.sqrt(np.sum(
+ (joint_out_lhand_align - joint_gt_lhand)**2, 1)).mean() *
+ 1000)
+ eval_result['pa_mpjpe_r_hand'].append(
+ np.sqrt(np.sum(
+ (joint_out_rhand_align - joint_gt_rhand)**2, 1)).mean() *
+ 1000)
+ eval_result['pa_mpjpe_hand'].append(
+ (np.sqrt(np.sum(
+ (joint_out_lhand_align - joint_gt_lhand)**2, 1)).mean() *
+ 1000 +
+ np.sqrt(np.sum(
+ (joint_out_rhand_align - joint_gt_rhand)**2, 1)).mean() *
+ 1000) / 2.)
+ return eval_result
+
+ def print_eval_result(self, eval_result):
+ print(f'======{cfg.testset}======')
+ print('PA MPVPE (All): %.2f mm' % np.mean(eval_result['pa_mpvpe_all']))
+ print('PA MPVPE (L-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_l_hand']))
+ print('PA MPVPE (R-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_r_hand']))
+ print('PA MPVPE (Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_hand']))
+ print('PA MPVPE (Face): %.2f mm' %
+ np.mean(eval_result['pa_mpvpe_face']))
+ print()
+
+ print('MPVPE (All): %.2f mm' % np.mean(eval_result['mpvpe_all']))
+ print('MPVPE (L-Hands): %.2f mm' %
+ np.mean(eval_result['mpvpe_l_hand']))
+ print('MPVPE (R-Hands): %.2f mm' %
+ np.mean(eval_result['mpvpe_r_hand']))
+ print('MPVPE (Hands): %.2f mm' % np.mean(eval_result['mpvpe_hand']))
+ print('MPVPE (Face): %.2f mm' % np.mean(eval_result['mpvpe_face']))
+ print()
+
+ print('PA MPJPE (Body): %.2f mm' %
+ np.mean(eval_result['pa_mpjpe_body']))
+ print('PA MPJPE (L-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpjpe_l_hand']))
+ print('PA MPJPE (R-Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpjpe_r_hand']))
+ print('PA MPJPE (Hands): %.2f mm' %
+ np.mean(eval_result['pa_mpjpe_hand']))
+
+        with open(os.path.join(cfg.result_dir, 'result.txt'), 'w') as f:
+            f.write(f'{cfg.testset} dataset \n')
+            f.write('PA MPVPE (All): %.2f mm\n' %
+                    np.mean(eval_result['pa_mpvpe_all']))
+            f.write('PA MPVPE (L-Hands): %.2f mm\n' %
+                    np.mean(eval_result['pa_mpvpe_l_hand']))
+            f.write('PA MPVPE (R-Hands): %.2f mm\n' %
+                    np.mean(eval_result['pa_mpvpe_r_hand']))
+            f.write('PA MPVPE (Hands): %.2f mm\n' %
+                    np.mean(eval_result['pa_mpvpe_hand']))
+            f.write('PA MPVPE (Face): %.2f mm\n' %
+                    np.mean(eval_result['pa_mpvpe_face']))
+            f.write('MPVPE (All): %.2f mm\n' %
+                    np.mean(eval_result['mpvpe_all']))
+            f.write('MPVPE (L-Hands): %.2f mm\n' %
+                    np.mean(eval_result['mpvpe_l_hand']))
+            f.write('MPVPE (R-Hands): %.2f mm\n' %
+                    np.mean(eval_result['mpvpe_r_hand']))
+            f.write('MPVPE (Hands): %.2f mm\n' %
+                    np.mean(eval_result['mpvpe_hand']))
+            f.write('MPVPE (Face): %.2f mm\n' %
+                    np.mean(eval_result['mpvpe_face']))
+            f.write('PA MPJPE (Body): %.2f mm\n' %
+                    np.mean(eval_result['pa_mpjpe_body']))
+            f.write('PA MPJPE (L-Hands): %.2f mm\n' %
+                    np.mean(eval_result['pa_mpjpe_l_hand']))
+            f.write('PA MPJPE (R-Hands): %.2f mm\n' %
+                    np.mean(eval_result['pa_mpjpe_r_hand']))
+            f.write('PA MPJPE (Hands): %.2f mm\n' %
+                    np.mean(eval_result['pa_mpjpe_hand']))
+
+    def validate_within_img_batch(
+            self, img_wh, points):  # check whether the points are within the image
+        # img_wh: (batch, 2), points: (batch, num_points, 2)
+
+        valid_mask = np.logical_and((points - img_wh[:, None]) < 0, points > 0)
+        valid_mask = np.logical_and(valid_mask[:, :, 0], valid_mask[:, :, 1])
+
+ return valid_mask
+    def decompress_keypoints(self, humandata) -> dict:
+        """If a key contains 'keypoints' and f'{key}_mask' is in humandata,
+        invalid zeros will be inserted at the right places and the
+        decompressed arrays will be returned in a dict.
+
+        Raises:
+            KeyError:
+                A key containing 'keypoints' has been found
+                but its corresponding mask is missing.
+        """
+ assert bool(humandata['__keypoints_compressed__']) is True
+ key_pairs = []
+ for key in humandata.files:
+ if key not in KPS2D_KEYS + KPS3D_KEYS:
+ continue
+ mask_key = f'{key}_mask'
+ if mask_key in humandata.files:
+ print(f'Decompress {key}...')
+ key_pairs.append([key, mask_key])
+ decompressed_dict = {}
+ for kpt_key, mask_key in key_pairs:
+ mask_array = np.asarray(humandata[mask_key])
+ compressed_kpt = humandata[kpt_key]
+ kpt_array = \
+ self.add_zero_pad(compressed_kpt, mask_array)
+ decompressed_dict[kpt_key] = kpt_array
+ del humandata
+ return decompressed_dict
+
+ def add_zero_pad(self, compressed_array: np.ndarray,
+ mask_array: np.ndarray) -> np.ndarray:
+ """Pad zeros to a compressed keypoints array.
+
+ Args:
+ compressed_array (np.ndarray):
+ A compressed keypoints array.
+ mask_array (np.ndarray):
+ The mask records compression relationship.
+
+ Returns:
+ np.ndarray:
+ A keypoints array in full-size.
+ """
+ assert mask_array.sum() == compressed_array.shape[1]
+ data_len, _, dim = compressed_array.shape
+ mask_len = mask_array.shape[0]
+ ret_value = np.zeros(shape=[data_len, mask_len, dim],
+ dtype=compressed_array.dtype)
+ valid_mask_index = np.where(mask_array == 1)[0]
+ ret_value[:, valid_mask_index, :] = compressed_array
+ return ret_value
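+
+# Usage sketch (illustrative, not part of the original file): how
+# `add_zero_pad` restores a compressed keypoints array. With a toy mask of
+# length 5 containing 3 valid slots and a compressed array of shape (2, 3, 3),
+# the padded result has shape (2, 5, 3), with zeros at the masked-out indices
+# (`dataset` below is a hypothetical instance of this dataset class):
+#
+#     mask = np.array([1, 0, 1, 1, 0])
+#     compressed = np.ones((2, 3, 3), dtype=np.float32)
+#     full = dataset.add_zero_pad(compressed, mask)
+#     # full.shape == (2, 5, 3); full[:, [0, 2, 3], :] == 1, rest are zeros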
diff --git a/detrsmpl/__init__.py b/detrsmpl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ac52099185f80381d9e8402ac74fc292f332ad91
--- /dev/null
+++ b/detrsmpl/__init__.py
@@ -0,0 +1,28 @@
+import mmcv
+
+from .version import __version__
+
+
+def digit_version(version_str):
+ digit_version = []
+ for x in version_str.split('.'):
+ if x.isdigit():
+ digit_version.append(int(x))
+ elif x.find('rc') != -1:
+ patch_version = x.split('rc')
+ digit_version.append(int(patch_version[0]) - 1)
+ digit_version.append(int(patch_version[1]))
+ return digit_version
+
+
+mmcv_minimum_version = '1.3.17'
+mmcv_maximum_version = '1.7.1'
+mmcv_version = digit_version(mmcv.__version__)
+
+
+assert (mmcv_version >= digit_version(mmcv_minimum_version)
+ and mmcv_version <= digit_version(mmcv_maximum_version)), \
+ f'MMCV=={mmcv.__version__} is used but incompatible. ' \
+    f'Please install mmcv>={mmcv_minimum_version}, <={mmcv_maximum_version}.'
+
+__all__ = ['__version__']
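+
+# Usage sketch (illustrative): how `digit_version` orders version strings.
+# Release-candidate suffixes sort below the corresponding release:
+#
+#     digit_version('1.7.0')     # -> [1, 7, 0]
+#     digit_version('1.7.0rc1')  # -> [1, 7, -1, 1], which sorts before [1, 7, 0]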
diff --git a/detrsmpl/apis/__init__.py b/detrsmpl/apis/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f1a2da2c226d4ec3b30cd2a32ac90e60ba803408
--- /dev/null
+++ b/detrsmpl/apis/__init__.py
@@ -0,0 +1,12 @@
+
+from detrsmpl.apis.test import (
+ collect_results_cpu,
+ collect_results_gpu,
+ multi_gpu_test,
+ single_gpu_test,
+)
+from detrsmpl.apis.train import set_random_seed, train_model
+
+__all__ = [
+    'collect_results_cpu', 'collect_results_gpu', 'multi_gpu_test',
+    'single_gpu_test', 'set_random_seed', 'train_model'
+]
diff --git a/detrsmpl/apis/inference.py b/detrsmpl/apis/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb18c9c2498ee02548b089092a73487998671cce
--- /dev/null
+++ b/detrsmpl/apis/inference.py
@@ -0,0 +1,518 @@
+import cv2
+import mmcv
+import numpy as np
+import torch
+from mmcv.parallel import collate
+from mmcv.runner import load_checkpoint
+
+from detrsmpl.data.datasets.pipelines import Compose
+from detrsmpl.models.architectures.builder import build_architecture
+from detrsmpl.models.backbones.builder import build_backbone
+from detrsmpl.utils.demo_utils import box2cs, xywh2xyxy, xyxy2xywh
+
+
+def init_model(config, checkpoint=None, device='cuda:0'):
+ """Initialize a model from config file.
+
+ Args:
+ config (str or :obj:`mmcv.Config`): Config file path or the config
+ object.
+ checkpoint (str, optional): Checkpoint path. If left as None, the model
+ will not load any weights.
+
+ Returns:
+ nn.Module: The constructed model.
+ (nn.Module, None): The constructed extractor model
+ """
+ if isinstance(config, str):
+ config = mmcv.Config.fromfile(config)
+ elif not isinstance(config, mmcv.Config):
+ raise TypeError('config must be a filename or Config object, '
+ f'but got {type(config)}')
+ config.data.test.test_mode = True
+
+ model = build_architecture(config.model)
+ if checkpoint is not None:
+ # load model checkpoint
+ load_checkpoint(model, checkpoint, map_location=device)
+ # save the config in the model for convenience
+ model.cfg = config
+ model.to(device)
+ model.eval()
+
+ extractor = None
+ if config.model.type == 'VideoBodyModelEstimator':
+ extractor = build_backbone(config.extractor.backbone)
+ if config.extractor.checkpoint is not None:
+ # load model checkpoint
+ load_checkpoint(extractor, config.extractor.checkpoint)
+ extractor.cfg = config
+ extractor.to(device)
+ extractor.eval()
+ return model, extractor
+
+
+class LoadImage:
+ """A simple pipeline to load image."""
+ def __init__(self, color_type='color', channel_order='bgr'):
+ self.color_type = color_type
+ self.channel_order = channel_order
+
+ def __call__(self, results):
+ """Call function to load images into results.
+
+ Args:
+ results (dict): A result dict contains the image_path.
+
+ Returns:
+ dict: ``results`` will be returned containing loaded image.
+ """
+ if isinstance(results['image_path'], str):
+ results['image_file'] = results['image_path']
+ img = mmcv.imread(results['image_path'], self.color_type,
+ self.channel_order)
+ elif isinstance(results['image_path'], np.ndarray):
+ results['image_file'] = ''
+ if self.color_type == 'color' and self.channel_order == 'rgb':
+ img = cv2.cvtColor(results['image_path'], cv2.COLOR_BGR2RGB)
+ else:
+ img = results['image_path']
+ else:
+ raise TypeError('"image_path" must be a numpy array or a str or '
+ 'a pathlib.Path object')
+
+ results['img'] = img
+ return results
+
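+# Usage sketch (illustrative; 'demo.jpg' is a placeholder path): LoadImage
+# accepts either an image path or an already decoded BGR array and always
+# populates results['img']:
+#
+#     pipeline = LoadImage(color_type='color', channel_order='rgb')
+#     results = pipeline({'image_path': 'demo.jpg'})
+#     # results['img'] is an RGB ndarray, results['image_file'] == 'demo.jpg'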
+
+def inference_image_based_model(
+ model,
+ img_or_path,
+ det_results,
+ bbox_thr=None,
+ format='xywh',
+):
+ """Inference a single image with a list of person bounding boxes.
+
+ Args:
+ model (nn.Module): The loaded pose model.
+ img_or_path (Union[str, np.ndarray]): Image filename or loaded image.
+ det_results (List(dict)): the item in the dict may contain
+ 'bbox' and/or 'track_id'.
+ 'bbox' (4, ) or (5, ): The person bounding box, which contains
+ 4 box coordinates (and score).
+ 'track_id' (int): The unique id for each human instance.
+ bbox_thr (float, optional): Threshold for bounding boxes.
+ Only bboxes with higher scores will be fed into the pose detector.
+ If bbox_thr is None, ignore it. Defaults to None.
+ format (str, optional): bbox format ('xyxy' | 'xywh'). Default: 'xywh'.
+ 'xyxy' means (left, top, right, bottom),
+ 'xywh' means (left, top, width, height).
+
+ Returns:
+ list[dict]: Each item in the list is a dictionary,
+ containing the bbox: (left, top, right, bottom, [score]),
+ SMPL parameters, vertices, kp3d, and camera.
+ """
+    # only two kinds of bbox format are supported.
+ assert format in ['xyxy', 'xywh']
+ mesh_results = []
+ if len(det_results) == 0:
+ return []
+
+ # Change for-loop preprocess each bbox to preprocess all bboxes at once.
+ bboxes = np.array([box['bbox'] for box in det_results])
+
+ # Select bboxes by score threshold
+ if bbox_thr is not None:
+ assert bboxes.shape[1] == 5
+ valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0]
+ bboxes = bboxes[valid_idx]
+ det_results = [det_results[i] for i in valid_idx]
+
+ if format == 'xyxy':
+ bboxes_xyxy = bboxes
+ bboxes_xywh = xyxy2xywh(bboxes)
+ else:
+ # format is already 'xywh'
+ bboxes_xywh = bboxes
+ bboxes_xyxy = xywh2xyxy(bboxes)
+
+ # if bbox_thr remove all bounding box
+ if len(bboxes_xywh) == 0:
+ return []
+
+ cfg = model.cfg
+ device = next(model.parameters()).device
+
+ # build the data pipeline
+ inference_pipeline = [LoadImage()] + cfg.inference_pipeline
+ inference_pipeline = Compose(inference_pipeline)
+
+ assert len(bboxes[0]) in [4, 5]
+
+ batch_data = []
+ input_size = cfg['img_res']
+ aspect_ratio = 1 if isinstance(input_size,
+ int) else input_size[0] / input_size[1]
+
+ for i, bbox in enumerate(bboxes_xywh):
+ center, scale = box2cs(bbox, aspect_ratio, bbox_scale_factor=1.25)
+ # prepare data
+ data = {
+ 'image_path': img_or_path,
+ 'center': center,
+ 'scale': scale,
+ 'rotation': 0,
+ 'bbox_score': bbox[4] if len(bbox) == 5 else 1,
+ 'sample_idx': i,
+ }
+ data = inference_pipeline(data)
+ batch_data.append(data)
+
+ batch_data = collate(batch_data, samples_per_gpu=1)
+
+ if next(model.parameters()).is_cuda:
+        # scatter does not work, so just move the image to the cuda device
+ batch_data['img'] = batch_data['img'].to(device)
+
+ # get all img_metas of each bounding box
+ batch_data['img_metas'] = [
+ img_metas[0] for img_metas in batch_data['img_metas'].data
+ ]
+
+ # forward the model
+ with torch.no_grad():
+ results = model(
+ img=batch_data['img'],
+ img_metas=batch_data['img_metas'],
+ sample_idx=batch_data['sample_idx'],
+ )
+
+ for idx in range(len(det_results)):
+ mesh_result = det_results[idx].copy()
+ mesh_result['bbox'] = bboxes_xyxy[idx]
+ mesh_result['camera'] = results['camera'][idx]
+ mesh_result['smpl_pose'] = results['smpl_pose'][idx]
+ mesh_result['smpl_beta'] = results['smpl_beta'][idx]
+ mesh_result['vertices'] = results['vertices'][idx]
+ mesh_result['keypoints_3d'] = results['keypoints_3d'][idx]
+ mesh_results.append(mesh_result)
+ return mesh_results
+
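+# Usage sketch (illustrative; the config/checkpoint paths and detector output
+# below are placeholders, not files shipped with this repo):
+#
+#     model, _ = init_model('configs/example.py', 'example.pth', device='cuda:0')
+#     det_results = [{'bbox': np.array([x, y, w, h, score])}]  # one person
+#     meshes = inference_image_based_model(
+#         model, 'demo.jpg', det_results, bbox_thr=0.5, format='xywh')
+#     # each item holds 'bbox', 'camera', 'smpl_pose', 'smpl_beta',
+#     # 'vertices' and 'keypoints_3d' for one detected person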
+
+def inference_video_based_model(model,
+ extracted_results,
+ with_track_id=True,
+ causal=True):
+ """Inference SMPL parameters from extracted featutres using a video-based
+ model.
+
+ Args:
+ model (nn.Module): The loaded mesh estimation model.
+ extracted_results (List[List[Dict]]): Multi-frame feature extraction
+ results stored in a nested list. Each element of the outer list
+ is the feature extraction results of a single frame, and each
+ element of the inner list is the feature information of one person,
+ which contains:
+ features (ndarray): extracted features
+ track_id (int): unique id of each person, required when
+                ``with_track_id==True``
+            bbox ((4, ) or (5, )): left, top, right, bottom, [score]
+ with_track_id: If True, the element in extracted_results is expected to
+ contain "track_id", which will be used to gather the feature
+ sequence of a person from multiple frames. Otherwise, the extracted
+ results in each frame are expected to have a consistent number and
+ order of identities. Default is True.
+ causal (bool): If True, the target frame is the first frame in
+ a sequence. Otherwise, the target frame is in the middle of a
+ sequence.
+
+ Returns:
+ list[dict]: Each item in the list is a dictionary, which contains:
+ SMPL parameters, vertices, kp3d, and camera.
+ """
+ cfg = model.cfg
+ device = next(model.parameters()).device
+ seq_len = cfg.data.test.seq_len
+ mesh_results = []
+ # build the data pipeline
+ inference_pipeline = Compose(cfg.inference_pipeline)
+ target_idx = 0 if causal else len(extracted_results) // 2
+
+ input_features = _gather_input_features(extracted_results)
+ feature_sequences = _collate_feature_sequence(input_features,
+ with_track_id, target_idx)
+ if not feature_sequences:
+ return mesh_results
+
+ batch_data = []
+
+ for i, seq in enumerate(feature_sequences):
+
+ data = {
+ 'features': seq['features'],
+ 'sample_idx': i,
+ }
+
+ data = inference_pipeline(data)
+ batch_data.append(data)
+
+ batch_data = collate(batch_data, samples_per_gpu=len(batch_data))
+
+ if next(model.parameters()).is_cuda:
+        # scatter does not work, so just move the features to the cuda device
+ batch_data['features'] = batch_data['features'].to(device)
+
+ with torch.no_grad():
+ results = model(features=batch_data['features'],
+ img_metas=batch_data['img_metas'],
+ sample_idx=batch_data['sample_idx'])
+
+ results['camera'] = results['camera'].reshape(-1, seq_len, 3)
+ results['smpl_pose'] = results['smpl_pose'].reshape(-1, seq_len, 24, 3, 3)
+ results['smpl_beta'] = results['smpl_beta'].reshape(-1, seq_len, 10)
+ results['vertices'] = results['vertices'].reshape(-1, seq_len, 6890, 3)
+ results['keypoints_3d'] = results['keypoints_3d'].reshape(
+ -1, seq_len, 17, 3)
+
+ for idx in range(len(feature_sequences)):
+ mesh_result = dict()
+ mesh_result['camera'] = results['camera'][idx, target_idx]
+ mesh_result['smpl_pose'] = results['smpl_pose'][idx, target_idx]
+ mesh_result['smpl_beta'] = results['smpl_beta'][idx, target_idx]
+ mesh_result['vertices'] = results['vertices'][idx, target_idx]
+ mesh_result['keypoints_3d'] = results['keypoints_3d'][idx, target_idx]
+ mesh_result['bbox'] = extracted_results[target_idx][idx]['bbox']
+ # 'track_id' is not included in results generated by mmdet
+ if 'track_id' in extracted_results[target_idx][idx].keys():
+ mesh_result['track_id'] = extracted_results[target_idx][idx][
+ 'track_id']
+ mesh_results.append(mesh_result)
+ return mesh_results
+
+
+def feature_extract(
+ model,
+ img_or_path,
+ det_results,
+ bbox_thr=None,
+ format='xywh',
+):
+ """Extract image features with a list of person bounding boxes.
+
+ Args:
+ model (nn.Module): The loaded feature extraction model.
+ img_or_path (Union[str, np.ndarray]): Image filename or loaded image.
+ det_results (List(dict)): the item in the dict may contain
+ 'bbox' and/or 'track_id'.
+ 'bbox' (4, ) or (5, ): The person bounding box, which contains
+ 4 box coordinates (and score).
+ 'track_id' (int): The unique id for each human instance.
+ bbox_thr (float, optional): Threshold for bounding boxes.
+ If bbox_thr is None, ignore it. Defaults to None.
+ format (str, optional): bbox format. Default: 'xywh'.
+ 'xyxy' means (left, top, right, bottom),
+ 'xywh' means (left, top, width, height).
+
+ Returns:
+ list[dict]: The bbox & pose info,
+ containing the bbox: (left, top, right, bottom, [score])
+ and the features.
+ """
+    # only two kinds of bbox format are supported.
+ assert format in ['xyxy', 'xywh']
+
+ cfg = model.cfg
+ device = next(model.parameters()).device
+
+ feature_results = []
+ if len(det_results) == 0:
+ return feature_results
+
+ # Change for-loop preprocess each bbox to preprocess all bboxes at once.
+ bboxes = np.array([box['bbox'] for box in det_results])
+ assert len(bboxes[0]) in [4, 5]
+
+ # Select bboxes by score threshold
+ if bbox_thr is not None:
+ assert bboxes.shape[1] == 5
+ valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0]
+ bboxes = bboxes[valid_idx]
+ det_results = [det_results[i] for i in valid_idx]
+
+ # if bbox_thr remove all bounding box
+ if len(bboxes) == 0:
+ return feature_results
+
+ if format == 'xyxy':
+ bboxes_xyxy = bboxes
+ bboxes_xywh = xyxy2xywh(bboxes)
+ else:
+ # format is already 'xywh'
+ bboxes_xywh = bboxes
+ bboxes_xyxy = xywh2xyxy(bboxes)
+
+ # build the data pipeline
+ extractor_pipeline = [LoadImage()] + cfg.extractor_pipeline
+ extractor_pipeline = Compose(extractor_pipeline)
+ batch_data = []
+ input_size = cfg['img_res']
+ aspect_ratio = 1 if isinstance(input_size,
+ int) else input_size[0] / input_size[1]
+
+ for i, bbox in enumerate(bboxes_xywh):
+ center, scale = box2cs(bbox, aspect_ratio, bbox_scale_factor=1.25)
+ # prepare data
+ data = {
+ 'image_path': img_or_path,
+ 'center': center,
+ 'scale': scale,
+ 'rotation': 0,
+ 'bbox_score': bbox[4] if len(bbox) == 5 else 1,
+ 'sample_idx': i,
+ }
+ data = extractor_pipeline(data)
+ batch_data.append(data)
+
+ batch_data = collate(batch_data, samples_per_gpu=1)
+
+ if next(model.parameters()).is_cuda:
+        # scatter does not work, so just move the image to the cuda device
+ batch_data['img'] = batch_data['img'].to(device)
+
+ # get all img_metas of each bounding box
+ batch_data['img_metas'] = [
+ img_metas[0] for img_metas in batch_data['img_metas'].data
+ ]
+
+ # forward the model
+ with torch.no_grad():
+ results = model(batch_data['img'])
+
+ if isinstance(results, list) or isinstance(results, tuple):
+ results = results[-1].mean(dim=-1).mean(dim=-1)
+
+ for idx in range(len(det_results)):
+ feature_result = det_results[idx].copy()
+ feature_result['bbox'] = bboxes_xyxy[idx]
+ feature_result['features'] = results[idx].cpu().numpy()
+ feature_results.append(feature_result)
+
+ return feature_results
+
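+# Usage sketch (illustrative; `extractor`, `model`, `frames` and
+# `det_results_per_frame` are placeholders): the video pipeline pairs
+# `feature_extract` with `inference_video_based_model`. Per-frame features are
+# extracted first, then the whole sequence is fed to the temporal model:
+#
+#     extracted_results = []
+#     for frame, dets in zip(frames, det_results_per_frame):
+#         extracted_results.append(
+#             feature_extract(extractor, frame, dets, bbox_thr=0.5))
+#     mesh_results = inference_video_based_model(
+#         model, extracted_results, with_track_id=True, causal=True)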
+
+def _gather_input_features(extracted_results):
+ """Gather input features.
+
+ Args:
+ extracted_results (List[List[Dict]]):
+ Multi-frame feature extraction results
+
+ Returns:
+ List[List[dict]]: Multi-frame feature extraction results
+ stored in a nested list. Each element of the outer list is the
+ feature extraction results of a single frame, and each element of
+ the inner list is the extracted results of one person,
+ which contains:
+ features (ndarray): extracted features
+ track_id (int): unique id of each person, required when
+                ``with_track_id==True``
+ """
+ sequence_inputs = []
+ for frame in extracted_results:
+ frame_inputs = []
+ for res in frame:
+ inputs = dict()
+ if 'features' in res:
+ inputs['features'] = res['features']
+ if 'track_id' in res:
+ inputs['track_id'] = res['track_id']
+ frame_inputs.append(inputs)
+ sequence_inputs.append(frame_inputs)
+ return sequence_inputs
+
+
+def _collate_feature_sequence(extracted_features,
+ with_track_id=True,
+ target_frame=0):
+ """Reorganize multi-frame feature extraction results into individual
+ feature sequences.
+
+ Args:
+ extracted_features (List[List[Dict]]): Multi-frame feature extraction
+ results stored in a nested list. Each element of the outer list
+ is the feature extraction results of a single frame, and each
+ element of the inner list is the extracted results of one person,
+ which contains:
+ features (ndarray): extracted features
+ track_id (int): unique id of each person, required when
+                ``with_track_id==True``
+ with_track_id (bool): If True, the element in pose_results is expected
+ to contain "track_id", which will be used to gather the pose
+ sequence of a person from multiple frames. Otherwise, the pose
+ results in each frame are expected to have a consistent number and
+ order of identities. Default is True.
+ target_frame (int): The index of the target frame. Default: 0.
+ """
+ T = len(extracted_features)
+ assert T > 0
+
+ target_frame = (T + target_frame) % T # convert negative index to positive
+
+ N = len(
+ extracted_features[target_frame]) # use identities in the target frame
+ if N == 0:
+ return []
+
+ C = extracted_features[target_frame][0]['features'].shape[0]
+
+ track_ids = None
+ if with_track_id:
+ track_ids = [
+ res['track_id'] for res in extracted_features[target_frame]
+ ]
+
+ feature_sequences = []
+ for idx in range(N):
+ feature_seq = dict()
+ # gather static information
+ for k, v in extracted_features[target_frame][idx].items():
+ if k != 'features':
+ feature_seq[k] = v
+ # gather keypoints
+ if not with_track_id:
+ feature_seq['features'] = np.stack(
+ [frame[idx]['features'] for frame in extracted_features])
+ else:
+ features = np.zeros((T, C), dtype=np.float32)
+ features[target_frame] = extracted_features[target_frame][idx][
+ 'features']
+ # find the left most frame containing track_ids[idx]
+ for frame_idx in range(target_frame - 1, -1, -1):
+ contains_idx = False
+ for res in extracted_features[frame_idx]:
+ if res['track_id'] == track_ids[idx]:
+ features[frame_idx] = res['features']
+ contains_idx = True
+ break
+ if not contains_idx:
+ # replicate the left most frame
+ features[frame_idx] = features[frame_idx + 1]
+
+ # find the right most frame containing track_idx[idx]
+ for frame_idx in range(target_frame + 1, T):
+ contains_idx = False
+ for res in extracted_features[frame_idx]:
+ if res['track_id'] == track_ids[idx]:
+ features[frame_idx] = res['features']
+ contains_idx = True
+ break
+ if not contains_idx:
+ # replicate the right most frame
+ features[frame_idx] = features[frame_idx - 1]
+ # break
+ feature_seq['features'] = features
+ feature_sequences.append(feature_seq)
+
+ return feature_sequences
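+
+# Shape sketch (illustrative): for T frames, N identities in the target frame
+# and C-dim features, `_collate_feature_sequence` returns N dicts whose
+# 'features' entry has shape (T, C); frames where a track is missing are
+# filled by replicating the nearest frame that does contain it. For example:
+#
+#     feats = [[{'features': np.zeros(2048), 'track_id': 7}],
+#              [{'features': np.ones(2048), 'track_id': 7}],
+#              [{'features': np.zeros(2048), 'track_id': 7}]]
+#     seqs = _collate_feature_sequence(feats, with_track_id=True,
+#                                      target_frame=1)
+#     # seqs[0]['features'].shape == (3, 2048)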
diff --git a/detrsmpl/apis/test.py b/detrsmpl/apis/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..98ed3dc5355967b9c3cff61aeabc2a0fab730e9e
--- /dev/null
+++ b/detrsmpl/apis/test.py
@@ -0,0 +1,172 @@
+import os.path as osp
+import pickle
+import shutil
+import tempfile
+import time
+
+import mmcv
+import torch
+import torch.distributed as dist
+from mmcv.runner import get_dist_info
+
+
+def single_gpu_test(model, data_loader):
+ """Test with single gpu."""
+ model.eval()
+ results = []
+ dataset = data_loader.dataset
+ prog_bar = mmcv.ProgressBar(len(dataset))
+ for i, data in enumerate(data_loader):
+ with torch.no_grad():
+ result = model(return_loss=False, **data)
+
+ batch_size = len(result)
+ if isinstance(result, list):
+ results.extend(result)
+ else:
+ results.append(result)
+
+ if 'img' in data.keys():
+ batch_size = data['img'].size(0)
+ else:
+ batch_size = data['features'].size(0)
+ for _ in range(batch_size):
+ prog_bar.update()
+ return results
+
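+# Usage sketch (illustrative): `single_gpu_test` expects a model already
+# wrapped for single-GPU inference (e.g. with mmcv's MMDataParallel) and a
+# test dataloader built elsewhere:
+#
+#     from mmcv.parallel import MMDataParallel
+#     model = MMDataParallel(model, device_ids=[0])
+#     outputs = single_gpu_test(model, data_loader)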
+
+def multi_gpu_test(model, data_loader, tmpdir=None, gpu_collect=False):
+ """Test model with multiple gpus.
+
+ This method tests model with multiple gpus and collects the results
+ under two different modes: gpu and cpu modes. By setting 'gpu_collect=True'
+    it encodes results to gpu tensors and uses gpu communication for results
+ collection. On cpu mode it saves the results on different gpus to 'tmpdir'
+ and collects them by the rank 0 worker.
+
+ Args:
+ model (nn.Module): Model to be tested.
+ data_loader (nn.Dataloader): Pytorch data loader.
+ tmpdir (str): Path of directory to save the temporary results from
+ different gpus under cpu mode.
+ gpu_collect (bool): Option to use either gpu or cpu to collect results.
+
+ Returns:
+ list: The prediction results.
+ """
+ model.eval()
+ results = []
+ dataset = data_loader.dataset
+ rank, world_size = get_dist_info()
+ if rank == 0:
+ # Check if tmpdir is valid for cpu_collect
+ if (not gpu_collect) and (tmpdir is not None and osp.exists(tmpdir)):
+            raise OSError(f'The tmpdir {tmpdir} already exists. '
+                          'Since tmpdir will be deleted after testing, '
+                          'please make sure you specify an empty one.')
+ prog_bar = mmcv.ProgressBar(len(dataset))
+ time.sleep(2) # This line can prevent deadlock problem in some cases.
+ for i, data in enumerate(data_loader):
+ with torch.no_grad():
+ result = model(return_loss=False, **data)
+ if isinstance(result, list):
+ results.extend(result)
+ else:
+ results.append(result)
+
+ if rank == 0:
+ if 'img' in data.keys():
+ batch_size = data['img'].size(0)
+ else:
+ batch_size = data['features'].size(0)
+ for _ in range(batch_size * world_size):
+ prog_bar.update()
+
+ # collect results from all ranks
+ if gpu_collect:
+ results = collect_results_gpu(results, len(dataset))
+ else:
+ results = collect_results_cpu(results, len(dataset), tmpdir)
+ return results
+
+
+def collect_results_cpu(result_part, size, tmpdir=None):
+ """Collect results in cpu."""
+ rank, world_size = get_dist_info()
+ # create a tmp dir if it is not specified
+ if tmpdir is None:
+ MAX_LEN = 512
+ # 32 is whitespace
+ dir_tensor = torch.full((MAX_LEN, ),
+ 32,
+ dtype=torch.uint8,
+ device='cuda')
+ if rank == 0:
+ mmcv.mkdir_or_exist('.dist_test')
+ tmpdir = tempfile.mkdtemp(dir='.dist_test')
+ tmpdir = torch.tensor(bytearray(tmpdir.encode()),
+ dtype=torch.uint8,
+ device='cuda')
+ dir_tensor[:len(tmpdir)] = tmpdir
+ dist.broadcast(dir_tensor, 0)
+ tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
+ else:
+ mmcv.mkdir_or_exist(tmpdir)
+ # dump the part result to the dir
+ mmcv.dump(result_part, osp.join(tmpdir, f'part_{rank}.pkl'))
+ dist.barrier()
+ # collect all parts
+ if rank != 0:
+ return None
+ else:
+ # load results of all parts from tmp dir
+ part_list = []
+ for i in range(world_size):
+ part_file = osp.join(tmpdir, f'part_{i}.pkl')
+ part_result = mmcv.load(part_file)
+ part_list.append(part_result)
+ # import ipdb;ipdb.set_trace()
+ # sort the results
+ ordered_results = []
+ for res in zip(*part_list):
+ ordered_results.extend(list(res))
+ # the dataloader may pad some samples
+ ordered_results = ordered_results[:size]
+ # remove tmp dir
+ shutil.rmtree(tmpdir)
+ return ordered_results
+
+
+def collect_results_gpu(result_part, size):
+ """Collect results in gpu."""
+ rank, world_size = get_dist_info()
+ # dump result part to tensor with pickle
+ part_tensor = torch.tensor(bytearray(pickle.dumps(result_part)),
+ dtype=torch.uint8,
+ device='cuda')
+ # gather all result part tensor shape
+ shape_tensor = torch.tensor(part_tensor.shape, device='cuda')
+ shape_list = [shape_tensor.clone() for _ in range(world_size)]
+ dist.all_gather(shape_list, shape_tensor)
+ # padding result part tensor to max length
+ shape_max = torch.tensor(shape_list).max()
+ part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda')
+ part_send[:shape_tensor[0]] = part_tensor
+ part_recv_list = [
+ part_tensor.new_zeros(shape_max) for _ in range(world_size)
+ ]
+ # gather all result part
+ dist.all_gather(part_recv_list, part_send)
+
+ if rank == 0:
+ part_list = []
+ for recv, shape in zip(part_recv_list, shape_list):
+ part_result = pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())
+ part_list.append(part_result)
+ # sort the results
+ ordered_results = []
+ for res in zip(*part_list):
+ ordered_results.extend(list(res))
+ # the dataloader may pad some samples
+ ordered_results = ordered_results[:size]
+ return ordered_results
diff --git a/detrsmpl/apis/train.py b/detrsmpl/apis/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dbcf0a580fd5e3377b44b9ff695dce6943d1521
--- /dev/null
+++ b/detrsmpl/apis/train.py
@@ -0,0 +1,163 @@
+import random
+import warnings
+
+import numpy as np
+import torch
+from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
+from mmcv.runner import (
+ DistSamplerSeedHook,
+ Fp16OptimizerHook,
+ OptimizerHook,
+ build_runner,
+)
+
+from detrsmpl.core.distributed_wrapper import DistributedDataParallelWrapper
+from detrsmpl.core.evaluation import DistEvalHook, EvalHook
+from detrsmpl.core.optimizer import build_optimizers
+from detrsmpl.data.datasets import build_dataloader, build_dataset
+from detrsmpl.utils.logger import get_root_logger
+
+
+def set_random_seed(seed, deterministic=False):
+ """Set random seed.
+
+ Args:
+ seed (int): Seed to be used.
+ deterministic (bool): Whether to set the deterministic option for
+ CUDNN backend, i.e., set `torch.backends.cudnn.deterministic`
+ to True and `torch.backends.cudnn.benchmark` to False.
+ Default: False.
+ """
+ random.seed(seed)
+ np.random.seed(seed)
+ torch.manual_seed(seed)
+ torch.cuda.manual_seed_all(seed)
+ if deterministic:
+ torch.backends.cudnn.deterministic = True
+ torch.backends.cudnn.benchmark = False
+
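+# Usage sketch (illustrative): typically called once from a training script
+# before datasets and models are built:
+#
+#     set_random_seed(0, deterministic=True)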
+
+def train_model(model,
+ dataset,
+ cfg,
+ distributed=False,
+ validate=False,
+ timestamp=None,
+ device='cuda',
+ meta=None):
+ """Main api for training model."""
+ logger = get_root_logger(cfg.log_level)
+
+ # prepare data loaders
+ dataset = dataset if isinstance(dataset, (list, tuple)) else [dataset]
+ data_loaders = [
+ build_dataloader(
+ ds,
+ cfg.data.samples_per_gpu,
+ cfg.data.workers_per_gpu,
+ # cfg.gpus will be ignored if distributed
+ num_gpus=len(cfg.gpu_ids),
+ dist=distributed,
+ round_up=True,
+ seed=cfg.seed) for ds in dataset
+ ]
+
+    # determine whether to use adversarial training or not
+ use_adverserial_train = cfg.get('use_adversarial_train', False)
+
+ # put model on gpus
+ if distributed:
+ find_unused_parameters = cfg.get('find_unused_parameters', False)
+ # Sets the `find_unused_parameters` parameter in
+ # torch.nn.parallel.DistributedDataParallel
+ if use_adverserial_train:
+ # Use DistributedDataParallelWrapper for adversarial training
+ model = DistributedDataParallelWrapper(
+ model,
+ device_ids=[torch.cuda.current_device()],
+ broadcast_buffers=False,
+ find_unused_parameters=find_unused_parameters)
+ else:
+ model = MMDistributedDataParallel(
+ model.cuda(),
+ device_ids=[torch.cuda.current_device()],
+ broadcast_buffers=False,
+ find_unused_parameters=find_unused_parameters)
+ else:
+ if device == 'cuda':
+ model = MMDataParallel(model.cuda(cfg.gpu_ids[0]),
+ device_ids=cfg.gpu_ids)
+ elif device == 'cpu':
+ model = model.cpu()
+ else:
+            raise ValueError(f'unsupported device name {device}.')
+
+ # build runner
+ optimizer = build_optimizers(model, cfg.optimizer)
+ if cfg.get('runner') is None:
+ cfg.runner = {
+ 'type': 'EpochBasedRunner',
+ 'max_epochs': cfg.total_epochs
+ }
+ warnings.warn(
+ 'config is now expected to have a `runner` section, '
+ 'please set `runner` in your config.', UserWarning)
+
+ runner = build_runner(cfg.runner,
+ default_args=dict(model=model,
+ batch_processor=None,
+ optimizer=optimizer,
+ work_dir=cfg.work_dir,
+ logger=logger,
+ meta=meta))
+
+    # an ugly workaround to make the .log and .log.json filenames the same
+ runner.timestamp = timestamp
+
+ if use_adverserial_train:
+ # The optimizer step process is included in the train_step function
+ # of the model, so the runner should NOT include optimizer hook.
+ optimizer_config = None
+ else:
+ # fp16 setting
+ fp16_cfg = cfg.get('fp16', None)
+ if fp16_cfg is not None:
+ optimizer_config = Fp16OptimizerHook(**cfg.optimizer_config,
+ **fp16_cfg,
+ distributed=distributed)
+ elif distributed and 'type' not in cfg.optimizer_config:
+ optimizer_config = OptimizerHook(**cfg.optimizer_config)
+ else:
+ optimizer_config = cfg.optimizer_config
+
+ # register hooks
+ runner.register_training_hooks(cfg.lr_config,
+ optimizer_config,
+ cfg.checkpoint_config,
+ cfg.log_config,
+ cfg.get('momentum_config', None),
+ custom_hooks_config=cfg.get(
+ 'custom_hooks', None))
+ if distributed:
+ runner.register_hook(DistSamplerSeedHook())
+
+ # register eval hooks
+ if validate:
+ val_dataset = build_dataset(cfg.data.val, dict(test_mode=True))
+ val_dataloader = build_dataloader(
+ val_dataset,
+ samples_per_gpu=cfg.data.samples_per_gpu,
+ workers_per_gpu=cfg.data.workers_per_gpu,
+ dist=distributed,
+ shuffle=False,
+ round_up=True)
+ eval_cfg = cfg.get('evaluation', {})
+ eval_cfg['by_epoch'] = cfg.runner['type'] != 'IterBasedRunner'
+ eval_hook = DistEvalHook if distributed else EvalHook
+ runner.register_hook(eval_hook(val_dataloader, **eval_cfg))
+
+ if cfg.resume_from:
+ runner.resume(cfg.resume_from)
+ elif cfg.load_from:
+ runner.load_checkpoint(cfg.load_from)
+ runner.run(data_loaders, cfg.workflow)
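+
+# Usage sketch (illustrative; `cfg` is an mmcv Config providing the fields
+# used above, e.g. data, optimizer, runner, lr_config, checkpoint_config,
+# log_config, work_dir and workflow, and `model` is built elsewhere, e.g. with
+# build_architecture):
+#
+#     datasets = [build_dataset(cfg.data.train)]
+#     train_model(model, datasets, cfg, distributed=False,
+#                 validate=True, device='cuda')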
diff --git a/detrsmpl/core/__init__.py b/detrsmpl/core/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/core/cameras/__init__.py b/detrsmpl/core/cameras/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..248d02bb8989969384e9685bd7e518cca8142e6d
--- /dev/null
+++ b/detrsmpl/core/cameras/__init__.py
@@ -0,0 +1,19 @@
+from detrsmpl.core.cameras import builder, camera_parameters, cameras
+from detrsmpl.core.cameras.builder import CAMERAS, build_cameras
+from detrsmpl.core.cameras.cameras import (
+ FoVOrthographicCameras,
+ FoVPerspectiveCameras,
+ MMCamerasBase,
+ OrthographicCameras,
+ PerspectiveCameras,
+ WeakPerspectiveCameras,
+ compute_direction_cameras,
+ compute_orbit_cameras,
+)
+
+__all__ = [
+ 'CAMERAS', 'FoVOrthographicCameras', 'FoVPerspectiveCameras',
+ 'MMCamerasBase', 'OrthographicCameras', 'PerspectiveCameras',
+ 'WeakPerspectiveCameras', 'build_cameras', 'builder', 'camera_parameters',
+ 'cameras', 'compute_orbit_cameras', 'compute_direction_cameras'
+]
diff --git a/detrsmpl/core/cameras/builder.py b/detrsmpl/core/cameras/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..711ecadf36ae56c0f1b16554c443be7a4e41b415
--- /dev/null
+++ b/detrsmpl/core/cameras/builder.py
@@ -0,0 +1,8 @@
+from mmcv.utils import Registry
+
+CAMERAS = Registry('cameras')
+
+
+def build_cameras(cfg):
+ """Build cameras."""
+ return CAMERAS.build(cfg)
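+
+# Usage sketch (illustrative, assuming `import torch`): cameras registered in
+# detrsmpl/core/cameras/cameras.py (e.g. 'PerspectiveCameras') can be built
+# from a config dict, mirroring the call in camera_parameters.py:
+#
+#     K = torch.eye(4)[None]   # (1, 4, 4) intrinsic matrix, placeholder values
+#     R = torch.eye(3)[None]   # (1, 3, 3) rotation
+#     T = torch.zeros(1, 3)    # (1, 3) translation
+#     cam = build_cameras(
+#         dict(type='PerspectiveCameras', K=K.float(), R=R.float(),
+#              T=T.float(), convention='opencv', in_ndc=False,
+#              resolution=(1080, 1920)))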
diff --git a/detrsmpl/core/cameras/camera_parameters.py b/detrsmpl/core/cameras/camera_parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c8534f70c2ea352feb3d8e54b5a69aedf715a2f
--- /dev/null
+++ b/detrsmpl/core/cameras/camera_parameters.py
@@ -0,0 +1,678 @@
+import json
+import warnings
+from enum import Enum
+from typing import Any, List, Tuple, Union
+
+import numpy as np
+import torch
+
+from detrsmpl.core.cameras.cameras import PerspectiveCameras
+from detrsmpl.core.conventions.cameras.convert_convention import (
+ convert_camera_matrix,
+ convert_K_3x3_to_4x4,
+ convert_K_4x4_to_3x3,
+)
+from .builder import build_cameras
+
+_CAMERA_PARAMETER_SUPPORTED_KEYS_ = {
+ 'H': {
+ 'type': int,
+ },
+ 'W': {
+ 'type': int,
+ },
+ 'in_mat': {
+ 'type': list,
+ 'len': 3,
+ },
+ 'rotation_mat': {
+ 'type': list,
+ 'len': 3,
+ },
+ 'translation': {
+ 'type': list,
+ 'len': 3,
+ },
+ 'k1': {
+ 'type': float,
+ },
+ 'k2': {
+ 'type': float,
+ },
+ 'k3': {
+ 'type': float,
+ },
+ 'k4': {
+ 'type': float,
+ },
+ 'k5': {
+ 'type': float,
+ },
+ 'k6': {
+ 'type': float,
+ },
+ 'p1': {
+ 'type': float,
+ },
+ 'p2': {
+ 'type': float,
+ },
+}
+
+
+class _TypeValidation(Enum):
+ MATCH = 0
+ ARRAY = 1
+ FAIL = 2
+
+
+class CameraParameter:
+ logger = None
+ SUPPORTED_KEYS = _CAMERA_PARAMETER_SUPPORTED_KEYS_
+
+ def __init__(self,
+ name: str = 'default',
+ H: int = 1080,
+ W: int = 1920) -> None:
+ """
+ Args:
+ name (str, optional):
+ Name of this camera. Defaults to "default".
+ H (int, optional):
+ Height of a frame, in pixel. Defaults to 1080.
+ W (int, optional):
+ Width of a frame, in pixel. Defaults to 1920.
+ """
+ self.name = name
+ self.parameters_dict = {}
+ in_mat = __zero_mat_list__(3)
+ self.parameters_dict['in_mat'] = in_mat
+ for distort_name in __distort_coefficient_names__:
+ self.parameters_dict[distort_name] = 0.0
+ _, H = self.validate_item('H', H)
+ self.parameters_dict['H'] = H
+ _, W = self.validate_item('W', W)
+ self.parameters_dict['W'] = W
+ r_mat = __zero_mat_list__(3)
+ self.parameters_dict['rotation_mat'] = r_mat
+ t_list = [0.0, 0.0, 0.0]
+ self.parameters_dict['translation'] = t_list
+
+ def reset_distort(self) -> None:
+ """Reset all distort coefficients to zero."""
+ for distort_name in __distort_coefficient_names__:
+ self.parameters_dict[distort_name] = 0.0
+
+ def get_opencv_distort_mat(self) -> np.ndarray:
+ """Get a numpy array of 8 distort coefficients, which is the distCoeffs
+ arg of cv2.undistort.
+
+ Returns:
+ ndarray:
+ (k_1, k_2, p_1, p_2, k_3, k_4, k_5, k_6) of 8 elements.
+ """
+ dist_coeffs = [
+ self.get_value('k1'),
+ self.get_value('k2'),
+ self.get_value('p1'),
+ self.get_value('p2'),
+ self.get_value('k3'),
+ self.get_value('k4'),
+ self.get_value('k5'),
+ self.get_value('k6'),
+ ]
+ dist_coeffs = np.array(dist_coeffs)
+ return dist_coeffs
+
+ def set_KRT(self,
+ K_mat: np.ndarray,
+ R_mat: np.ndarray,
+ T_vec: np.ndarray,
+ inverse_extrinsic: bool = False) -> None:
+ """Set intrinsic and extrinsic of a camera.
+
+ Args:
+ K_mat (np.ndarray):
+ In shape [3, 3].
+ R_mat (np.ndarray):
+ Rotation from world to view in default.
+ In shape [3, 3].
+ T_vec (np.ndarray):
+ Translation from world to view in default.
+ In shape [3,].
+ inverse_extrinsic (bool, optional):
+ If true, R_mat and T_vec transform a point
+ from view to world. Defaults to False.
+ """
+ k_shape = K_mat.shape
+ assert k_shape[0] == k_shape[1] == 3
+ r_shape = R_mat.shape
+ assert r_shape[0] == r_shape[1] == 3
+ assert T_vec.ndim == 1 and T_vec.shape[0] == 3
+ self.set_mat_np('in_mat', K_mat)
+ if inverse_extrinsic:
+ R_mat = np.linalg.inv(R_mat)
+ T_vec = -np.dot(R_mat, T_vec).reshape((3))
+ self.set_mat_np('rotation_mat', R_mat)
+ self.set_value('translation', T_vec.tolist())
+
+ def get_KRT(self, k_dim=3) -> List[np.ndarray]:
+ """Get intrinsic and extrinsic of a camera.
+
+ Args:
+ k_dim (int, optional):
+ Dimension of the returned mat K.
+ Defaults to 3.
+
+ Raises:
+ ValueError: k_dim is neither 3 nor 4.
+
+ Returns:
+ List[np.ndarray]:
+ K_mat (np.ndarray):
+ In shape [3, 3].
+ R_mat (np.ndarray):
+ Rotation from world to view in default.
+ In shape [3, 3].
+ T_vec (np.ndarray):
+ Translation from world to view in default.
+ In shape [3,].
+ """
+ K_3x3 = self.get_mat_np('in_mat')
+ R_mat = self.get_mat_np('rotation_mat')
+ T_vec = np.asarray(self.get_value('translation'))
+ if k_dim == 3:
+ return [K_3x3, R_mat, T_vec]
+ elif k_dim == 4:
+ K_3x3 = np.expand_dims(K_3x3, 0) # shape (1, 3, 3)
+ K_4x4 = convert_K_3x3_to_4x4(
+ K=K_3x3, is_perspective=True) # shape (1, 4, 4)
+ K_4x4 = K_4x4[0, :, :]
+ return [K_4x4, R_mat, T_vec]
+ else:
+ raise ValueError(f'K mat cannot be converted to {k_dim}x{k_dim}')
+
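+    # Usage sketch (illustrative): `set_KRT` and `get_KRT` round-trip a
+    # calibration stored in the default world-to-view direction:
+    #
+    #     cam_param = CameraParameter(name='cam0', H=1080, W=1920)
+    #     cam_param.set_KRT(K_mat=np.eye(3), R_mat=np.eye(3),
+    #                       T_vec=np.zeros(3))
+    #     K, R, T = cam_param.get_KRT(k_dim=3)  # 3x3 K, 3x3 R, (3,) T
+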
+ def set_mat_np(self, mat_key: str, mat_numpy: np.ndarray) -> None:
+ """Set a matrix-type parameter to mat_numpy.
+
+ Args:
+ mat_key (str):
+ Key of the target matrix. in_mat or rotation_mat.
+ mat_numpy (ndarray):
+ Matrix in numpy format.
+
+ Raises:
+ TypeError:
+ mat_numpy is not an np.ndarray.
+ """
+ if not isinstance(mat_numpy, np.ndarray):
+ raise TypeError
+ self.set_mat_list(mat_key, mat_numpy.tolist())
+
+ def set_mat_list(self, mat_key: str, mat_list: List[list]) -> None:
+ """Set a matrix-type parameter to mat_list.
+
+ Args:
+ mat_key (str):
+ Key of the target matrix. in_mat or rotation_mat.
+ mat_list (List[list]):
+ Matrix in list format.
+ """
+ _, mat_list = self.validate_item(mat_key, mat_list)
+ self.parameters_dict[mat_key] = mat_list
+
+ def set_value(self, key: str, value: Any) -> None:
+ """Set a parameter to value.
+
+ Args:
+ key (str):
+ Name of the parameter.
+ value (object):
+ New value of the parameter.
+ """
+ _, value = self.validate_item(key, value)
+ self.parameters_dict[key] = value
+
+ def get_value(self, key: str) -> Any:
+ """Get a parameter by key.
+
+ Args:
+ key (str):
+ Name of the parameter.
+ Raises:
+ KeyError: key not in self.parameters_dict
+
+ Returns:
+ object:
+ Value of the parameter.
+ """
+ if key not in self.parameters_dict:
+ raise KeyError(key)
+ else:
+ return self.parameters_dict[key]
+
+ def get_mat_np(self, key: str) -> np.ndarray:
+ """Get a a matrix-type parameter by key.
+
+ Args:
+ key (str):
+ Name of the parameter.
+ Raises:
+ KeyError: key not in self.parameters_dict
+
+ Returns:
+ ndarray:
+ Value of the parameter.
+ """
+ if key not in self.parameters_dict:
+ raise KeyError(key)
+ else:
+ mat_list = self.parameters_dict[key]
+ mat_np = np.array(mat_list).reshape((3, 3))
+ return mat_np
+
+ def to_string(self) -> str:
+ """Convert self.to_dict() to a string.
+
+ Returns:
+ str:
+ A dict in json string format.
+ """
+ dump_dict = self.to_dict()
+ ret_str = json.dumps(dump_dict)
+ return ret_str
+
+ def to_dict(self) -> dict:
+ """Dump camera name and parameters to dict.
+
+ Returns:
+ dict:
+ Put self.name and self.parameters_dict
+ in one dict.
+ """
+ dump_dict = self.parameters_dict.copy()
+ dump_dict['name'] = self.name
+ return dump_dict
+
+    def dump(self, json_path: str) -> None:
+        """Dump camera name and parameters to a json file.
+
+        Args:
+            json_path (str):
+                Path of the output json file.
+        """
+ dump_dict = self.to_dict()
+ with open(json_path, 'w') as f_write:
+ json.dump(dump_dict, f_write)
+
+ def load(self, json_path: str) -> None:
+ """Load camera name and parameters from a file."""
+ with open(json_path, 'r') as f_read:
+ dumped_dict = json.load(f_read)
+ self.load_from_dict(dumped_dict)
+
+ def load_from_dict(self, json_dict: dict) -> None:
+ """Load name and parameters from a dict.
+
+ Args:
+ json_dict (dict):
+ A dict comes from self.to_dict().
+ """
+ for key in json_dict.keys():
+ if key == 'name':
+ self.name = json_dict[key]
+ elif key == 'rotation':
+ self.parameters_dict['rotation_mat'] = np.array(
+ json_dict[key]).reshape(3, 3).tolist()
+ elif key == 'translation':
+ self.parameters_dict[key] = np.array(json_dict[key]).reshape(
+ (3)).tolist()
+ else:
+ self.parameters_dict[key] = json_dict[key]
+ if '_mat' in key:
+ self.parameters_dict[key] = np.array(
+ self.parameters_dict[key]).reshape(3, 3).tolist()
+
+ def load_from_chessboard(self,
+ chessboard_dict: dict,
+ name: str,
+ inverse: bool = True) -> None:
+ """Load name and parameters from a dict.
+
+ Args:
+ chessboard_dict (dict):
+ A dict loaded from json.load(chessboard_file).
+ name (str):
+ Name of this camera.
+            inverse (bool, optional):
+                Whether to invert the rotation and translation mat.
+                Defaults to True.
+ """
+ camera_param_dict = \
+ __parse_chessboard_param__(chessboard_dict, name, inverse=inverse)
+ self.load_from_dict(camera_param_dict)
+
+ def load_kinect_from_smc(self, smc_reader, kinect_id: int) -> None:
+ """Load name and parameters of a kinect from an SmcReader instance.
+
+ Args:
+ smc_reader (mmhuman3d.data.data_structures.smc_reader.SMCReader):
+ An SmcReader instance containing kinect camera parameters.
+ kinect_id (int):
+ Id of the target kinect.
+ """
+ name = kinect_id
+ extrinsics_dict = \
+ smc_reader.get_kinect_color_extrinsics(
+ kinect_id, homogeneous=False
+ )
+ rot_np = extrinsics_dict['R']
+ trans_np = extrinsics_dict['T']
+ intrinsics_np = \
+ smc_reader.get_kinect_color_intrinsics(
+ kinect_id
+ )
+ resolution = \
+ smc_reader.get_kinect_color_resolution(
+ kinect_id
+ )
+ rmatrix = np.linalg.inv(rot_np).reshape(3, 3)
+ tvec = -np.dot(rmatrix, trans_np)
+ self.name = name
+ self.set_mat_np('in_mat', intrinsics_np)
+ self.set_mat_np('rotation_mat', rmatrix)
+ self.set_value('translation', tvec.tolist())
+ self.set_value('H', resolution[1])
+ self.set_value('W', resolution[0])
+
+ def load_iphone_from_smc(self,
+ smc_reader,
+ iphone_id: int = 0,
+ frame_id: int = 0) -> None:
+ """Load name and parameters of an iPhone from an SmcReader instance.
+
+ Args:
+ smc_reader (mmhuman3d.data.data_structures.smc_reader.SMCReader):
+ An SmcReader instance containing kinect camera parameters.
+ iphone_id (int):
+ Id of the target iphone.
+ Defaults to 0.
+ frame_id (int):
+ Frame ID of one selected frame.
+ It only influences the intrinsics.
+ Defaults to 0.
+ """
+ name = f'iPhone_{iphone_id}'
+ extrinsics_mat = \
+ smc_reader.get_iphone_extrinsics(
+ iphone_id, homogeneous=True
+ )
+ rot_np = extrinsics_mat[:3, :3]
+ trans_np = extrinsics_mat[:3, 3]
+ intrinsics_np = \
+ smc_reader.get_iphone_intrinsics(
+ iphone_id, frame_id
+ )
+ resolution = \
+ smc_reader.get_iphone_color_resolution(
+ iphone_id
+ )
+ rmatrix = np.linalg.inv(rot_np).reshape(3, 3)
+ tvec = -np.dot(rmatrix, trans_np)
+ self.name = name
+ self.set_mat_np('in_mat', intrinsics_np)
+ self.set_mat_np('rotation_mat', rmatrix)
+ self.set_value('translation', tvec.tolist())
+ self.set_value('H', resolution[1])
+ self.set_value('W', resolution[0])
+
+ @classmethod
+ def load_from_perspective_cameras(cls,
+ cam,
+ name: str,
+ resolution: Union[List, Tuple] = None):
+ """Load parameters from a PerspectiveCameras and return a
+ CameraParameter.
+
+ Args:
+ cam (mmhuman3d.core.cameras.cameras.PerspectiveCameras):
+ An instance.
+ name (str):
+ Name of this camera.
+ """
+ assert isinstance(cam, PerspectiveCameras
+ ), 'Wrong input, support PerspectiveCameras only!'
+ if len(cam) > 1:
+ warnings.warn('Will only use the first camera in the batch.')
+ cam = cam[0]
+
+ resolution = resolution if resolution is not None else cam.resolution[
+ 0].tolist()
+
+ height, width = int(resolution[0]), int(resolution[1])
+
+        cam_param = CameraParameter(name=name, H=height, W=width)
+
+ k_4x4 = cam.K # shape (1, 4, 4)
+ r_3x3 = cam.R # shape (1, 3, 3)
+ t_3 = cam.T # shape (1, 3)
+ is_perspective = cam.is_perspective()
+ in_ndc = cam.in_ndc()
+
+ k_4x4, r_3x3, t_3 = convert_camera_matrix(K=k_4x4,
+ R=r_3x3,
+ T=t_3,
+ is_perspective=False,
+ in_ndc_dst=False,
+ in_ndc_src=in_ndc,
+ convention_src='pytorch3d',
+ convention_dst='opencv',
+ resolution_src=(height,
+ width),
+ resolution_dst=(height,
+ width))
+
+ k_3x3 = \
+ convert_K_4x4_to_3x3(k_4x4, is_perspective=is_perspective)
+
+ k_3x3 = k_3x3.numpy()[0]
+ r_3x3 = r_3x3.numpy()[0]
+ t_3 = t_3.numpy()[0]
+ cam_param.name = name
+ cam_param.set_mat_np('in_mat', k_3x3)
+ cam_param.set_mat_np('rotation_mat', r_3x3)
+ cam_param.set_value('translation', t_3.tolist())
+ cam_param.parameters_dict.update(H=height)
+ cam_param.parameters_dict.update(W=width)
+ return cam_param
+
+ def export_to_perspective_cameras(self) -> PerspectiveCameras:
+ """Export to a opencv defined screen space PerspectiveCameras.
+
+ Returns:
+ Same defined PerspectiveCameras of batch_size 1.
+ """
+ height = self.parameters_dict['H']
+ width = self.parameters_dict['W']
+ k_4x4, rotation, translation = self.get_KRT(k_dim=4)
+        k_4x4 = np.expand_dims(k_4x4, 0)  # shape (1, 4, 4)
+ rotation = np.expand_dims(rotation, 0) # shape (1, 3, 3)
+ translation = np.expand_dims(translation, 0) # shape (1, 3)
+ new_K = torch.from_numpy(k_4x4)
+ new_R = torch.from_numpy(rotation)
+ new_T = torch.from_numpy(translation)
+ cam = build_cameras(
+ dict(type='PerspectiveCameras',
+ K=new_K.float(),
+ R=new_R.float(),
+ T=new_T.float(),
+ convention='opencv',
+ in_ndc=False,
+ resolution=(height, width)))
+ return cam
+
+ def validate_item(self, key: Any, val: Any) -> List:
+ """Check whether the key and its value matches definition in
+ CameraParameter.SUPPORTED_KEYS.
+
+ Args:
+ key (Any):
+ Key in CameraParameter.
+ val (Any):
+ Value to the key.
+
+ Raises:
+ KeyError:
+ key cannot be found in
+ CameraParameter.SUPPORTED_KEYS.
+ TypeError:
+ Value's type doesn't match definition.
+ Returns:
+ key (Any): The input key.
+ val (Any): The value casted into correct format.
+ """
+ self.__check_key__(key)
+ formatted_val = self.__validate_value_type__(key, val)
+ return key, formatted_val
+
+ def __check_key__(self, key: Any) -> None:
+ """Check whether the key matches definition in
+ CameraParameter.SUPPORTED_KEYS.
+
+ Args:
+ key (Any):
+ Key in CameraParameter.
+
+ Raises:
+ KeyError:
+ key cannot be found in
+ CameraParameter.SUPPORTED_KEYS.
+ """
+ if key not in self.__class__.SUPPORTED_KEYS:
+ err_msg = 'Key check failed in CameraParameter:\n'
+ err_msg += f'key={str(key)}\n'
+ raise KeyError(err_msg)
+
+ def __validate_value_type__(self, key: Any, val: Any) -> Any:
+ """Check whether the type of value matches definition in
+ CameraParameter.SUPPORTED_KEYS.
+
+ Args:
+ key (Any):
+ Key in CameraParameter.
+ val (Any):
+ Value to the key.
+
+ Raises:
+ TypeError:
+ Value is supported but doesn't match definition.
+
+ Returns:
+ val (Any): The value casted into correct format.
+ """
+ np_type_mapping = {int: np.integer, float: np.floating}
+ supported_keys = self.__class__.SUPPORTED_KEYS
+ validation_result = _TypeValidation.FAIL
+ ret_val = None
+ if supported_keys[key]['type'] == int or\
+ supported_keys[key]['type'] == float:
+ type_str = str(type(val))
+ class_name = type_str.split('\'')[1]
+ if type(val) == self.__class__.SUPPORTED_KEYS[key]['type']:
+ validation_result = _TypeValidation.MATCH
+ ret_val = val
+ elif class_name.startswith('numpy'):
+ # a value is required, not array
+ if np.issubdtype(type(val),
+ np_type_mapping[supported_keys[key]['type']]):
+ validation_result = _TypeValidation.MATCH
+ ret_val = val.astype(supported_keys[key]['type'])
+ elif np.issubdtype(type(val), np.ndarray):
+ validation_result = _TypeValidation.ARRAY
+ elif class_name.startswith('torch'):
+ # only one element tensors
+ # can be converted to Python scalars
+ if len(val.size()) == 0:
+ val_item = val.item()
+ if type(val_item) == supported_keys[key]['type']:
+ validation_result = _TypeValidation.MATCH
+ ret_val = val_item
+ else:
+ validation_result = _TypeValidation.ARRAY
+ else:
+ if type(val) == self.__class__.SUPPORTED_KEYS[key]['type']:
+ validation_result = _TypeValidation.MATCH
+ ret_val = val
+ if validation_result != _TypeValidation.MATCH:
+ err_msg = 'Type check failed in CameraParameter:\n'
+ err_msg += f'key={str(key)}\n'
+ err_msg += f'type(val)={type(val)}\n'
+ if validation_result == _TypeValidation.ARRAY:
+ err_msg += 'A single value is expected, ' +\
+ 'neither an array nor a slice.\n'
+ raise TypeError(err_msg)
+ return ret_val
+
+
+def __parse_chessboard_param__(chessboard_camera_param, name, inverse=True):
+ """Parse a dict loaded from chessboard file into another dict needed by
+ CameraParameter.
+
+ Args:
+ chessboard_camera_param (dict):
+ A dict loaded from json.load(chessboard_file).
+ name (str):
+ Name of this camera.
+ inverse (bool, optional):
+            Whether to invert the rotation and translation mat.
+ Defaults to True.
+
+ Returns:
+ dict:
+ A dict of parameters in CameraParameter.to_dict() format.
+ """
+ camera_param_dict = {}
+ camera_param_dict['H'] = chessboard_camera_param['imgSize'][1]
+ camera_param_dict['W'] = chessboard_camera_param['imgSize'][0]
+ camera_param_dict['in_mat'] = chessboard_camera_param['K']
+ camera_param_dict['k1'] = 0
+ camera_param_dict['k2'] = 0
+ camera_param_dict['k3'] = 0
+ camera_param_dict['k4'] = 0
+ camera_param_dict['k5'] = 0
+ camera_param_dict['p1'] = 0
+ camera_param_dict['p2'] = 0
+ camera_param_dict['name'] = name
+ camera_param_dict['rotation'] = chessboard_camera_param['R']
+ camera_param_dict['translation'] = chessboard_camera_param['T']
+ if inverse:
+ rmatrix = np.linalg.inv(
+ np.array(camera_param_dict['rotation']).reshape(3, 3))
+ camera_param_dict['rotation'] = rmatrix.tolist()
+ tmatrix = np.array(camera_param_dict['translation']).reshape((3, 1))
+ tvec = -np.dot(rmatrix, tmatrix)
+ camera_param_dict['translation'] = tvec.reshape((3)).tolist()
+ return camera_param_dict
+
+
+__distort_coefficient_names__ = [
+ 'k1', 'k2', 'k3', 'k4', 'k5', 'k6', 'p1', 'p2'
+]
+
+
+def __zero_mat_list__(n=3):
+ """Return a zero mat in list format.
+
+ Args:
+ n (int, optional):
+ Length of the edge.
+ Defaults to 3.
+
+ Returns:
+ list:
+ List[List[int]]
+ """
+ ret_list = [[0] * n for _ in range(n)]
+ return ret_list
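+
+
+# Illustrative, hypothetical usage sketch (not part of the original module);
+# it only runs when this file is executed directly. The chessboard dict below
+# is made up for demonstration and mirrors the keys consumed by
+# __parse_chessboard_param__ above.
+if __name__ == '__main__':
+    assert __zero_mat_list__(2) == [[0, 0], [0, 0]]
+    demo_chessboard_param = {
+        'imgSize': [1920, 1080],
+        'K': [[1000.0, 0.0, 960.0], [0.0, 1000.0, 540.0], [0.0, 0.0, 1.0]],
+        'R': [[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, 1.0]],
+        'T': [0.0, 0.0, 3.0],
+    }
+    demo_param = __parse_chessboard_param__(
+        demo_chessboard_param, name='demo_cam', inverse=True)
+    # With inverse=True the extrinsics become R' = R^-1, T' = -R^-1 @ T.
+    assert demo_param['H'] == 1080 and demo_param['W'] == 1920
+    assert demo_param['translation'] == [0.0, 0.0, -3.0]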
diff --git a/detrsmpl/core/cameras/cameras.py b/detrsmpl/core/cameras/cameras.py
new file mode 100644
index 0000000000000000000000000000000000000000..907d591e4f6b2edf2d6fc37b0265c06ebbe3f600
--- /dev/null
+++ b/detrsmpl/core/cameras/cameras.py
@@ -0,0 +1,1426 @@
+import math
+from typing import Iterable, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+from pytorch3d.renderer import cameras
+from pytorch3d.structures import Meshes
+from pytorch3d.transforms import Transform3d
+
+from detrsmpl.core.conventions.cameras.convert_convention import (
+ convert_camera_matrix,
+ convert_ndc_to_screen,
+ convert_screen_to_ndc,
+ convert_world_view,
+)
+from detrsmpl.utils.transforms import ee_to_rotmat
+from .builder import CAMERAS
+
+
+class MMCamerasBase(cameras.CamerasBase):
+ """Inherited from Pytorch3D CamerasBase and provide some new functions."""
+ def __init__(self, **kwargs) -> None:
+ """Initialize your cameras with `build_cameras` following:
+
+ 1): provide `K`, `R`, `T`, `resolution`/`image_size`, `in_ndc`
+ directly.
+ `K` should be shape of (N, 3, 3) or (N, 4, 4).
+ `R` should be shape of (N, 3, 3).
+ `T` should be shape of (N, 3).
+ 2): if `K` is not provided, will use `get_default_projection_matrix`
+ to generate K from camera intrinsic parameters.
+ E.g., you can pass `focal_length`, `principal_point` for
+ perspective camers.
+ If these args are not provided, will use default values.
+ 3): if `R` is not provided, will use Identity matrix as default.
+ 4): if `T` is not provided, will use zeros matrix as default.
+ 5): `convention` means your source parameter camera convention.
+ This mainly depends on how you get the matrixs. E.g., you get the
+ `K` `R`, `T` by calibration with opencv, you should set
+ `convention = opencv`. To figure out your camera convention,
+ please see the definition of its extrinsic and intrinsic matrixs.
+ For projection and rendering, the matrixs will be converted to
+ `pytorch3d` finally since the `transforms3d` called in rendering
+ and projection are defined as `pytorch3d` convention.
+ 6): `image_size` equals `resolution`.
+ 7): `in_ndc` could be set for 'PerspectiveCameras' and
+ 'OrthographicCameras', other cameras are fixed for this arg.
+ `in_ndc = True` means your projection matrix is defined as `camera
+ space to NDC space`. Under this cirecumstance you need to set
+ `image_size` or `resolution` (they are equal) when you need to do
+ `transform_points_screen`. You can also override resolution
+ in `transform_points_screen` function.
+ `in_ndc = False` means your projections matrix is defined as
+ `cameras space to screen space`. Under this cirecumstance you do
+ not need to set `image_size` or `resolution` (they are equal) when
+ you need to do `transform_points_screen` since the projection
+ matrix is defined as view space to screen space.
+ """
+ for k in kwargs:
+ if isinstance(kwargs.get(k), np.ndarray):
+ kwargs.update({k: torch.Tensor(kwargs[k])})
+ convention = kwargs.pop('convention', 'pytorch3d').lower()
+ in_ndc = kwargs.pop('in_ndc', kwargs.get('_in_ndc'))
+ kwargs.update(_in_ndc=in_ndc)
+ is_perspective = kwargs.get('_is_perspective')
+ kwargs.pop('is_perspective', None)
+
+ image_size = kwargs.get('image_size', kwargs.get('resolution', None))
+
+ if image_size is not None:
+ if isinstance(image_size, (int, float)):
+ image_size = (image_size, image_size)
+ if isinstance(image_size, (tuple, list)):
+ image_size = torch.Tensor(image_size)
+ if isinstance(image_size, torch.Tensor):
+ if image_size.numel() == 1:
+ image_size = image_size.repeat(2)
+ image_size = image_size.view(-1, 2)
+
+ if kwargs.get('K') is None:
+ focal_length = kwargs.get('focal_length', None)
+ if focal_length is not None:
+ if not isinstance(focal_length, Iterable):
+ focal_length = [focal_length, focal_length]
+ if not torch.is_tensor(focal_length):
+ focal_length = torch.FloatTensor(focal_length).view(-1, 2)
+ elif focal_length.numel() == 1:
+ focal_length = focal_length.repeat(2).view(-1, 2)
+ kwargs.update(focal_length=focal_length)
+
+ principal_point = kwargs.get('principal_point', None)
+ if principal_point is not None:
+ if isinstance(principal_point, (tuple, list)):
+ principal_point = torch.FloatTensor(principal_point)
+ principal_point = principal_point.view(-1, 2)
+ kwargs.update(principal_point=principal_point)
+
+ K = self.get_default_projection_matrix(**kwargs)
+
+ K, _, _ = convert_camera_matrix(K=K,
+ is_perspective=is_perspective,
+ convention_src='pytorch3d',
+ convention_dst='pytorch3d',
+ in_ndc_src=in_ndc,
+ in_ndc_dst=in_ndc,
+ resolution_dst=image_size,
+ resolution_src=image_size)
+ kwargs.update(K=K)
+
+ K, R, T = convert_camera_matrix(K=kwargs.get('K'),
+ R=kwargs.get('R', None),
+ T=kwargs.get('T', None),
+ convention_src=convention,
+ convention_dst='pytorch3d',
+ is_perspective=is_perspective,
+ in_ndc_src=in_ndc,
+ in_ndc_dst=in_ndc,
+ resolution_src=image_size,
+ resolution_dst=image_size)
+
+ if image_size is not None:
+ if image_size.shape[0] == 1:
+ image_size = image_size.repeat(K.shape[0], 1)
+ kwargs.update(image_size=image_size)
+ kwargs.update(resolution=image_size)
+
+ kwargs.update(K=K, R=R, T=T)
+
+ super().__init__(**kwargs)
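+
+    # Illustrative, hypothetical construction sketch (not part of the
+    # original file): intrinsics/extrinsics calibrated with OpenCV can be
+    # passed with `convention='opencv'` and are converted to the pytorch3d
+    # convention internally, e.g.
+    #   cam = PerspectiveCameras(K=K_cv, R=R_cv, T=T_cv, convention='opencv',
+    #                            in_ndc=False, image_size=(1080, 1920))
+    # where K_cv, R_cv, T_cv are hypothetical (N, 3, 3)/(N, 3, 3)/(N, 3)
+    # tensors.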
+
+ def get_camera_plane_normals(self, **kwargs) -> torch.Tensor:
+ """Get the identity normal vector which stretchs out of the camera
+ plane.
+
+ Could pass `R` to override the camera extrinsic rotation matrix.
+ Returns:
+ torch.Tensor: shape will be (N, 3)
+ """
+ normals = torch.Tensor([0, 0, 1]).view(1, 3).to(self.device)
+ w2v_trans = self.get_world_to_view_transform(**kwargs)
+ normals = w2v_trans.inverse().transform_normals(normals)
+ return normals.view(-1, 3)
+
+ def compute_depth_of_points(self, points: torch.Tensor) -> torch.Tensor:
+ """Compute depth of points to the camera plane.
+
+ Args:
+ points ([torch.Tensor]): shape should be (batch_size, ..., 3).
+
+ Returns:
+            torch.Tensor: shape will be (batch_size, ..., 1)
+ """
+ world_to_view_transform = self.get_world_to_view_transform()
+ world_to_view_points = world_to_view_transform.transform_points(
+ points.to(self.device))
+ return world_to_view_points[..., 2:3]
+
+ def compute_normal_of_meshes(self, meshes: Meshes) -> torch.Tensor:
+ """Compute normal of meshes in the camera view.
+
+ Args:
+            meshes (Meshes): meshes whose vertex normals are transformed
+                into the camera view.
+
+        Returns:
+            torch.Tensor: shape will be (batch_size, num_verts, 3)
+ """
+ world_to_view_transform = self.get_world_to_view_transform()
+ world_to_view_normals = world_to_view_transform.transform_normals(
+ meshes.verts_normals_padded().to(self.device))
+ return world_to_view_normals
+
+ def __repr__(self):
+ """Rewrite __repr__
+
+ Returns:
+ str: print the information of cameras (N, in_ndc, device).
+ """
+ main_str = super().__repr__()
+ main_str = main_str.split(')')[0]
+ main_str += f'N: {self.__len__()}, in_ndc: {self.in_ndc()}, '
+ main_str += f'device: {self.device})'
+ return main_str
+
+ def get_image_size(self):
+ """Returns the image size, if provided, expected in the form of
+ (height, width) The image size is used for conversion of projected
+ points to screen coordinates."""
+ if hasattr(self, 'image_size'):
+ image_size = self.image_size
+ if hasattr(self, 'resolution'):
+ if self.resolution is not None:
+ image_size = self.resolution
+ else:
+ image_size = None
+
+ return image_size
+
+ def __getitem__(
+ self, index: Union[slice, int, torch.Tensor, List,
+ Tuple]) -> 'MMCamerasBase':
+ """Slice the cameras by batch dim.
+
+ Args:
+ index (Union[slice, int, torch.Tensor, List, Tuple]):
+ index for slicing.
+
+ Returns:
+ MMCamerasBase: sliced cameras.
+ """
+ if isinstance(index, int):
+ index = [index]
+ return self.__class__(K=self.K[index],
+ R=self.R[index],
+ T=self.T[index],
+ image_size=self.get_image_size()[index]
+ if self.get_image_size() is not None else None,
+ in_ndc=self.in_ndc(),
+ convention='pytorch3d',
+ device=self.device)
+
+ def extend(self, N) -> 'MMCamerasBase':
+ """Create new camera class which contains each input camera N times.
+
+ Args:
+ N: number of new copies of each camera.
+
+ Returns:
+ MMCamerasBase object.
+ """
+ return self.__class__(K=self.K.repeat(N, 1, 1),
+ R=self.R.repeat(N, 1, 1),
+ T=self.T.repeat(N, 1),
+ image_size=self.get_image_size(),
+ in_ndc=self.in_ndc(),
+ convention='pytorch3d',
+ device=self.device)
+
+ def extend_(self, N):
+ """extend camera inplace."""
+ self.K = self.K.repeat(N, 1, 1)
+ self.R = self.R.repeat(N, 1, 1)
+ self.T = self.T.repeat(N, 1)
+ self._N = self._N * N
+
+ @classmethod
+    def get_default_projection_matrix(cls, **kwargs):
+ """Class method. Calculate the projective transformation matrix by
+ default parameters.
+
+ Args:
+ **kwargs: parameters for the projection can be passed in as keyword
+ arguments to override the default values set in `__init__`.
+
+ Return:
+ a `torch.Tensor` which represents a batch of projection matrices K
+ of shape (N, 4, 4)
+ """
+ raise NotImplementedError()
+
+ def to_screen_(self, **kwargs) -> 'MMCamerasBase':
+ """Convert to screen inplace."""
+ if self.in_ndc():
+ if self.get_image_size() is None:
+ self.image_size = kwargs.get('image_size')
+ else:
+ self.image_size = self.get_image_size()
+ self.K = convert_ndc_to_screen(K=self.K,
+ resolution=self.image_size,
+ is_perspective=self._is_perspective)
+ self._in_ndc = False
+ else:
+ print('Redundant operation, already in screen.')
+
+ def to_ndc_(self, **kwargs) -> 'MMCamerasBase':
+ """Convert to ndc inplace."""
+ if self.in_ndc():
+ print('Redundant operation, already in ndc.')
+ else:
+ if self.get_image_size() is None:
+ self.image_size = kwargs.get('image_size')
+ else:
+ self.image_size = self.get_image_size()
+ self.K = convert_screen_to_ndc(K=self.K,
+ resolution=self.image_size,
+ is_perspective=self._is_perspective)
+ self._in_ndc = True
+
+ def to_screen(self, **kwargs) -> 'MMCamerasBase':
+ """Convert to screen."""
+ if self.in_ndc():
+ if self.get_image_size() is None:
+ self.image_size = kwargs.get('image_size')
+ else:
+ self.image_size = self.get_image_size()
+
+ K = convert_ndc_to_screen(K=self.K,
+ resolution=self.image_size,
+ is_perspective=self._is_perspective)
+ return self.__class__(K=K,
+ R=self.R,
+ T=self.T,
+ in_ndc=False,
+ resolution=self.image_size)
+ else:
+ print('Redundant operation, already in screen.')
+
+ def to_ndc(self, **kwargs) -> 'MMCamerasBase':
+ """Convert to ndc."""
+ if self.in_ndc():
+ print('Redundant operation, already in ndc.')
+ else:
+ if self.get_image_size() is None:
+ self.image_size = kwargs.get('image_size')
+ else:
+ self.image_size = self.get_image_size()
+ K = convert_screen_to_ndc(K=self.K,
+ resolution=self.image_size,
+ is_perspective=self._is_perspective)
+ return self.__class__(K=K,
+ R=self.R,
+ T=self.T,
+ in_ndc=True,
+ resolution=self.image_size)
+
+ def detach(self) -> 'MMCamerasBase':
+ image_size = self.image_size.detach(
+ ) if self.image_size is not None else None
+ return self.__class__(K=self.K.detach(),
+ R=self.R.detach(),
+ T=self.T.detach(),
+ in_ndc=self.in_ndc(),
+ device=self.device,
+ resolution=image_size)
+
+ def concat(self, others) -> 'MMCamerasBase':
+ if isinstance(others, type(self)):
+ others = [others]
+ else:
+ raise TypeError('Could only concat with same type cameras.')
+ return concat_cameras([self] + others)
+
+
+@CAMERAS.register_module(name=('WeakPerspectiveCameras', 'WeakPerspective',
+ 'weakperspective'))
+class WeakPerspectiveCameras(MMCamerasBase):
+ """Inherited from [Pytorch3D cameras](https://github.com/facebookresearch/
+ pytorch3d/blob/main/pytorch3d/renderer/cameras.py) and mimiced the code
+ style. And re-inmplemented functions: compute_projection_matrix,
+ get_projection_transform, unproject_points, is_perspective, in_ndc for
+ render.
+
+ K modified from [VIBE](https://github.com/mkocabas/VIBE/blob/master/
+ lib/utils/renderer.py) and changed to opencv convention.
+ Original license please see docs/additional_license/md.
+
+ This intrinsic matrix is orthographics indeed, but could serve as
+ weakperspective for single smpl mesh.
+ """
+ def __init__(
+ self,
+ scale_x: Union[torch.Tensor, float] = 1.0,
+ scale_y: Union[torch.Tensor, float] = 1.0,
+ transl_x: Union[torch.Tensor, float] = 0.0,
+ transl_y: Union[torch.Tensor, float] = 0.0,
+ znear: Union[torch.Tensor, float] = -1.0,
+ aspect_ratio: Union[torch.Tensor, float] = 1.0,
+ K: Optional[torch.Tensor] = None,
+ R: Optional[torch.Tensor] = None,
+ T: Optional[torch.Tensor] = None,
+ device: Union[torch.device, str] = 'cpu',
+ convention: str = 'pytorch3d',
+ **kwargs,
+ ):
+ """Initialize. If K is provided, don't need scale_x, scale_y, transl_x,
+ transl_y, znear, aspect_ratio.
+
+ Args:
+ scale_x (Union[torch.Tensor, float], optional):
+ Scale in x direction.
+ Defaults to 1.0.
+ scale_y (Union[torch.Tensor, float], optional):
+ Scale in y direction.
+ Defaults to 1.0.
+ transl_x (Union[torch.Tensor, float], optional):
+ Translation in x direction.
+ Defaults to 0.0.
+ transl_y (Union[torch.Tensor, float], optional):
+ Translation in y direction.
+ Defaults to 0.0.
+ znear (Union[torch.Tensor, float], optional):
+                near clipping plane of the view frustum.
+ Defaults to -1.0.
+ aspect_ratio (Union[torch.Tensor, float], optional):
+ aspect ratio of the image pixels. 1.0 indicates square pixels.
+ Defaults to 1.0.
+ K (Optional[torch.Tensor], optional): Intrinsic matrix of shape
+ (N, 4, 4). If provided, don't need scale_x, scale_y, transl_x,
+ transl_y, znear, aspect_ratio.
+ Defaults to None.
+ R (Optional[torch.Tensor], optional):
+ Rotation matrix of shape (N, 3, 3).
+ Defaults to None.
+ T (Optional[torch.Tensor], optional):
+ Translation matrix of shape (N, 3).
+ Defaults to None.
+ device (Union[torch.device, str], optional):
+ torch device. Defaults to 'cpu'.
+ """
+ kwargs.update(
+ _in_ndc=True,
+ _is_perspective=False,
+ )
+ kwargs.pop('in_ndc', None)
+ kwargs.pop('is_perspective', None)
+ super().__init__(scale_x=scale_x,
+ scale_y=scale_y,
+ transl_x=transl_x,
+ transl_y=transl_y,
+ znear=znear,
+ aspect_ratio=aspect_ratio,
+ K=K,
+ R=R,
+ T=T,
+ device=device,
+ convention=convention,
+ **kwargs)
+
+ @staticmethod
+ def convert_orig_cam_to_matrix(
+ orig_cam: torch.Tensor,
+ **kwargs) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """Compute intrinsic camera matrix from orig_cam parameter of smpl.
+
+ .. code-block:: python
+
+ r > 1::
+
+ K = [[sx*r, 0, 0, tx*sx*r],
+ [0, sy, 0, ty*sy],
+ [0, 0, 1, 0],
+ [0, 0, 0, 1]]
+
+ or r < 1::
+
+ K = [[sx, 0, 0, tx*sx],
+ [0, sy/r, 0, ty*sy/r],
+ [0, 0, 1, 0],
+ [0, 0, 0, 1],]
+
+ rotation matrix: (N, 3, 3)::
+
+ [[1, 0, 0],
+ [0, 1, 0],
+ [0, 0, 1]]
+
+ translation matrix: (N, 3)::
+
+ [0, 0, -znear]
+
+ Args:
+ orig_cam (torch.Tensor): shape should be (N, 4).
+ znear (Union[torch.Tensor, float], optional):
+                near clipping plane of the view frustum.
+ Defaults to 0.0.
+ aspect_ratio (Union[torch.Tensor, float], optional):
+ aspect ratio of the image pixels. 1.0 indicates square pixels.
+ Defaults to 1.0.
+
+ Returns:
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ opencv intrinsic matrix: (N, 4, 4)
+ """
+ znear = kwargs.get('znear', -1.0)
+ aspect_ratio = kwargs.get('aspect_ratio', 1.0)
+ _N = orig_cam.shape[0]
+ scale_x, scale_y, transl_x, transl_y = orig_cam[:, 0], orig_cam[:, 1],\
+ orig_cam[:, 2], orig_cam[:, 3]
+ K = torch.zeros((_N, 4, 4), dtype=torch.float32)
+ if aspect_ratio >= 1.0:
+ K[:, 0, 0] = scale_x * aspect_ratio
+ K[:, 1, 1] = scale_y
+ K[:, 0, 3] = transl_x * scale_x * aspect_ratio
+ K[:, 1, 3] = transl_y * scale_y
+ else:
+ K[:, 0, 0] = scale_x
+ K[:, 1, 1] = scale_y / aspect_ratio
+ K[:, 0, 3] = transl_x * scale_x
+ K[:, 1, 3] = transl_y * scale_y / aspect_ratio
+
+ K[:, 3, 3] = 1
+ K[:, 2, 2] = 1
+ R = torch.eye(3, 3)[None].repeat(_N, 1, 1)
+ T = torch.zeros(_N, 3)
+ T[:, 2] = znear
+ return K, R, T
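+
+    # Illustrative sketch (hypothetical values, not in the original file):
+    #   orig_cam = torch.Tensor([[1.0, 1.0, 0.1, 0.2]])  # [sx, sy, tx, ty]
+    #   K, R, T = WeakPerspectiveCameras.convert_orig_cam_to_matrix(orig_cam)
+    #   # K[0, 0, 0] == 1.0, K[0, 0, 3] == 0.1, R[0] is identity,
+    #   # T[0, 2] == -1.0 (the default znear).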
+
+ @staticmethod
+ def convert_K_to_orig_cam(
+ K: torch.Tensor,
+ aspect_ratio: Union[torch.Tensor, float] = 1.0,
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """Compute intrinsic camera matrix from pred camera parameter of smpl.
+
+ Args:
+ K (torch.Tensor):
+ opencv orthographics intrinsic matrix: (N, 4, 4)
+
+ .. code-block:: python
+
+ K = [[sx*r, 0, 0, tx*sx*r],
+ [0, sy, 0, ty*sy],
+ [0, 0, 1, 0],
+ [0, 0, 0, 1],]
+
+ aspect_ratio (Union[torch.Tensor, float], optional):
+ aspect ratio of the image pixels. 1.0 indicates square pixels.
+ Defaults to 1.0.
+
+ Returns:
+
+ orig_cam (torch.Tensor): shape should be (N, 4).
+ """
+ _N = K.shape[0]
+ s_x = K[:, 0, 0] / aspect_ratio
+ s_y = K[:, 1, 1] / aspect_ratio
+ t_x = K[:, 0, 3] / (aspect_ratio * s_x)
+ t_y = K[:, 1, 3] / s_y
+        orig_cam = torch.stack([s_x, s_y, t_x, t_y], -1).view(_N, 4)
+ return orig_cam
+
+ @classmethod
+ def get_default_projection_matrix(cls, **args):
+ """Class method. Calculate the projective transformation matrix by
+ default parameters.
+
+ Args:
+ **kwargs: parameters for the projection can be passed in as keyword
+ arguments to override the default values set in `__init__`.
+
+ Return:
+ a `torch.Tensor` which represents a batch of projection matrices K
+ of shape (N, 4, 4)
+ """
+ orig_cam = args.get('orig_cam', None)
+ scale_x = args.get('scale_x', 1.0)
+ scale_y = args.get('scale_y', 1.0)
+ transl_x = args.get('transl_x', 0.0)
+ transl_y = args.get('transl_y', 0.0)
+ aspect_ratio = args.get('aspect_ratio', 1.0)
+ batch_size = args.get('batch_size', 1)
+ device = args.get('device', 'cpu')
+
+ if orig_cam is not None:
+            # `orig_cam` is already contained in `args`; pass it only as a
+            # keyword to avoid a duplicate-argument error.
+            K, _, _ = cls.convert_orig_cam_to_matrix(**args)
+ else:
+ K = torch.zeros((1, 4, 4), dtype=torch.float32)
+
+ K[:, 0, 0] = scale_x * aspect_ratio
+ K[:, 1, 1] = scale_y
+ K[:, 3, 3] = 1
+ K[:, 0, 3] = transl_x * scale_x * aspect_ratio
+ K[:, 1, 3] = transl_y * scale_y
+ K[:, 2, 2] = 1
+ K = K.repeat(batch_size, 1, 1).to(device)
+ return K
+
+ def compute_projection_matrix(self, scale_x, scale_y, transl_x, transl_y,
+ aspect_ratio) -> torch.Tensor:
+ """Compute the calibration matrix K of shape (N, 4, 4)
+
+ Args:
+ scale_x (Union[torch.Tensor, float], optional):
+ Scale in x direction.
+ scale_y (Union[torch.Tensor, float], optional):
+ Scale in y direction.
+ transl_x (Union[torch.Tensor, float], optional):
+ Translation in x direction.
+ transl_y (Union[torch.Tensor, float], optional):
+ Translation in y direction.
+ aspect_ratio (Union[torch.Tensor, float], optional):
+ aspect ratio of the image pixels. 1.0 indicates square pixels.
+
+ Returns:
+ torch.FloatTensor of the calibration matrix with shape (N, 4, 4)
+ """
+ K = torch.zeros((self._N, 4, 4),
+ dtype=torch.float32,
+ device=self.device)
+
+ K[:, 0, 0] = scale_x * aspect_ratio
+ K[:, 1, 1] = scale_y
+ K[:, 3, 3] = 1
+ K[:, 0, 3] = transl_x * scale_x * aspect_ratio
+ K[:, 1, 3] = transl_y * scale_y
+ K[:, 2, 2] = 1
+ return K
+
+ def get_projection_transform(self, **kwargs) -> Transform3d:
+ """Calculate the orthographic projection matrix. Use column major
+ order.
+
+ Args:
+ **kwargs: parameters for the projection can be passed in to
+ override the default values set in __init__.
+ Return:
+ a Transform3d object which represents a batch of projection
+ matrices of shape (N, 4, 4)
+ """
+ K = kwargs.get('K', self.K)
+ if K is not None:
+ if K.shape != (self._N, 4, 4):
+ msg = f'Expected K to have shape of ({self._N}, 4, 4)'
+ raise ValueError(msg)
+ else:
+ K = self.compute_projection_matrix(
+ kwargs.get('scale_x', self.scale_x),
+ kwargs.get('scale_y', self.scale_y),
+                kwargs.get('transl_x', self.transl_x),
+                kwargs.get('transl_y', self.transl_y),
+ kwargs.get('aspect_ratio', self.aspect_ratio))
+
+ transform = Transform3d(matrix=K.transpose(1, 2).contiguous(),
+ device=self.device)
+ return transform
+
+ def unproject_points(self,
+ xy_depth: torch.Tensor,
+ world_coordinates: bool = True,
+ **kwargs) -> torch.Tensor:
+ """Sends points from camera coordinates (NDC or screen) back to camera
+ view or world coordinates depending on the `world_coordinates` boolean
+ argument of the function."""
+ if world_coordinates:
+ to_camera_transform = self.get_full_projection_transform(**kwargs)
+ else:
+ to_camera_transform = self.get_projection_transform(**kwargs)
+
+ unprojection_transform = to_camera_transform.inverse()
+ return unprojection_transform.transform_points(xy_depth)
+
+ def is_perspective(self):
+ """Boolean of whether is perspective."""
+ return False
+
+ def in_ndc(self):
+ """Boolean of whether in NDC."""
+ return True
+
+ def to_ndc_(self, **kwargs):
+ """Not implemented."""
+ raise NotImplementedError()
+
+ def to_screen_(self, **kwargs):
+ """Not implemented."""
+ raise NotImplementedError()
+
+ def to_ndc(self, **kwargs):
+ """Not implemented."""
+ raise NotImplementedError()
+
+ def to_screen(self, **kwargs):
+ """Not implemented."""
+ raise NotImplementedError()
+
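+# Illustrative sketch (not part of the original file): wrapping the matrices
+# from `convert_orig_cam_to_matrix` into a camera and projecting points,
+# assuming `orig_cam` is a (N, 4) tensor and `verts` a (N, V, 3) tensor:
+#   K, R, T = WeakPerspectiveCameras.convert_orig_cam_to_matrix(orig_cam)
+#   cam = WeakPerspectiveCameras(K=K, R=R, T=T, resolution=(224, 224))
+#   verts_ndc = cam.transform_points(verts)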
+
+@CAMERAS.register_module(name=('PerspectiveCameras', 'perspective',
+ 'Perspective'))
+class PerspectiveCameras(cameras.PerspectiveCameras, MMCamerasBase):
+ """Inherited from Pytorch3D `PerspectiveCameras`."""
+ def __init__(
+ self,
+ focal_length=1.0,
+ principal_point=((0.0, 0.0), ),
+ R: Optional[torch.Tensor] = None,
+ T: Optional[torch.Tensor] = None,
+ K: Optional[torch.Tensor] = None,
+ device: Union[torch.device, str] = 'cpu',
+ in_ndc: bool = True,
+ convention: str = 'pytorch3d',
+ image_size: Optional[Union[List, Tuple, torch.Tensor]] = None,
+ **kwargs,
+ ) -> None:
+ """
+ Args:
+ focal_length (float, torch.Tensor, optional): Defaults to 1.0.
+ principal_point (tuple, optional): Defaults to ((0.0, 0.0), ).
+ R (Optional[torch.Tensor], optional): Defaults to None.
+ T (Optional[torch.Tensor], optional): Defaults to None.
+ K (Optional[torch.Tensor], optional): Defaults to None.
+ device (Union[torch.device, str], optional): Defaults to 'cpu'.
+ in_ndc (bool, optional): Defaults to True.
+ convention (str, optional): Defaults to 'pytorch3d'.
+ image_size (Optional[Union[List, Tuple, torch.Tensor]], optional):
+ Defaults to None.
+
+ """
+ if image_size is not None:
+ kwargs.update({'image_size': image_size})
+ kwargs.update(
+ _in_ndc=in_ndc,
+ _is_perspective=True,
+ )
+ kwargs.pop('is_perspective', None)
+ kwargs.pop('in_ndc', None)
+
+ super(cameras.PerspectiveCameras,
+ self).__init__(device=device,
+ focal_length=focal_length,
+ principal_point=principal_point,
+ R=R,
+ T=T,
+ K=K,
+ convention=convention,
+ **kwargs)
+ if image_size is not None:
+ if (self.image_size < 1).any(): # pyre-ignore
+ raise ValueError('Image_size provided has invalid values')
+ else:
+ self.image_size = None
+
+ def __getitem__(self, index: Union[slice, int, torch.Tensor, List, Tuple]):
+ """Slice the cameras by batch dim.
+
+ Args:
+ index (Union[slice, int, torch.Tensor, List, Tuple]):
+ index for slicing.
+
+ Returns:
+ MMCamerasBase: sliced cameras.
+ """
+ return super(cameras.PerspectiveCameras, self).__getitem__(index)
+
+ @classmethod
+ def get_default_projection_matrix(cls, **args) -> torch.Tensor:
+ """Class method. Calculate the projective transformation matrix by
+ default parameters.
+
+ Args:
+ **kwargs: parameters for the projection can be passed in as keyword
+ arguments to override the default values set in `__init__`.
+
+ Return:
+ a `torch.Tensor` which represents a batch of projection matrices K
+ of shape (N, 4, 4)
+ """
+ batch_size = args.get('batch_size', 1)
+ device = args.get('device', 'cpu')
+ focal_length = args.get('focal_length')
+ principal_point = args.get('principal_point')
+
+ return cameras._get_sfm_calibration_matrix(
+ N=batch_size,
+ device=device,
+ focal_length=focal_length,
+ principal_point=principal_point,
+ orthographic=False)
+
+ def get_ndc_camera_transform(self, **kwargs) -> Transform3d:
+ kwargs.pop('cameras', None)
+ return super().get_ndc_camera_transform(**kwargs)
+
+ def transform_points_screen(self,
+ points,
+ eps: Optional[float] = None,
+ **kwargs) -> torch.Tensor:
+ kwargs.pop('cameras', None)
+ return super().transform_points_screen(points, eps, **kwargs)
+
+
+@CAMERAS.register_module(name=('FoVPerspectiveCameras', 'FoVPerspective',
+ 'fovperspective'))
+class FoVPerspectiveCameras(cameras.FoVPerspectiveCameras, MMCamerasBase):
+ """Inherited from Pytorch3D `FoVPerspectiveCameras`."""
+ def __init__(
+ self,
+ znear=1.0,
+ zfar=100.0,
+ aspect_ratio=1.0,
+ fov=60.0,
+ degrees: bool = True,
+ R: Optional[torch.Tensor] = None,
+ T: Optional[torch.Tensor] = None,
+ K: Optional[torch.Tensor] = None,
+ device: Union[torch.device, str] = 'cpu',
+ convention: str = 'pytorch3d',
+ **kwargs,
+ ) -> None:
+ """Initialize a camera.
+
+ Args:
+ znear (float, optional): Defaults to 1.0.
+ zfar (float, optional): Defaults to 100.0.
+ aspect_ratio (float, optional): Defaults to 1.0.
+ fov (float, optional): Defaults to 60.0.
+ degrees (bool, optional): Defaults to True.
+ R (Optional[torch.Tensor], optional): Defaults to None.
+ T (Optional[torch.Tensor], optional): Defaults to None.
+ K (Optional[torch.Tensor], optional): Defaults to None.
+ device (Union[torch.device, str], optional): Defaults to 'cpu'.
+ convention (str, optional): Defaults to 'pytorch3d'.
+ """
+ kwargs.update(
+ _in_ndc=True,
+ _is_perspective=True,
+ )
+ kwargs.pop('in_ndc', None)
+ kwargs.pop('is_perspective', None)
+ super(cameras.FoVPerspectiveCameras, self).__init__(
+ device=device,
+ znear=znear,
+ zfar=zfar,
+ aspect_ratio=aspect_ratio,
+ fov=fov,
+ R=R,
+ T=T,
+ K=K,
+ convention=convention,
+ **kwargs,
+ )
+ self.degrees = degrees
+
+ def __getitem__(self, index: Union[slice, int, torch.Tensor, List, Tuple]):
+ """Slice the cameras by batch dim.
+
+ Args:
+ index (Union[slice, int, torch.Tensor, List, Tuple]):
+ index for slicing.
+
+ Returns:
+ MMCamerasBase: sliced cameras.
+ """
+ return super(cameras.FoVPerspectiveCameras, self).__getitem__(index)
+
+ def get_ndc_camera_transform(self, **kwargs) -> Transform3d:
+ kwargs.pop('cameras', None)
+ return super().get_ndc_camera_transform(**kwargs)
+
+ def transform_points_screen(self,
+ points,
+ eps: Optional[float] = None,
+ **kwargs) -> torch.Tensor:
+ kwargs.pop('cameras', None)
+ return super().transform_points_screen(points, eps, **kwargs)
+
+ @classmethod
+ def get_default_projection_matrix(cls, **args) -> torch.Tensor:
+ """Class method. Calculate the projective transformation matrix by
+ default parameters.
+
+ Args:
+ **kwargs: parameters for the projection can be passed in as keyword
+ arguments to override the default values set in `__init__`.
+
+ Return:
+ a `torch.Tensor` which represents a batch of projection matrices K
+ of shape (N, 4, 4)
+ """
+ znear = args.get('znear', 1.0)
+ zfar = args.get('zfar', 100.0)
+ aspect_ratio = args.get('aspect_ratio', 1.0)
+ fov = args.get('fov', 60.0)
+ degrees = args.get('degrees', True)
+ batch_size = args.get('batch_size', 1)
+
+ K = torch.zeros((1, 4, 4), dtype=torch.float32)
+ if degrees:
+ fov = (math.pi / 180) * fov
+
+ if not torch.is_tensor(fov):
+ fov = torch.tensor(fov)
+ tanHalfFov = torch.tan((fov / 2))
+ max_y = tanHalfFov * znear
+ min_y = -max_y
+ max_x = max_y * aspect_ratio
+ min_x = -max_x
+
+ z_sign = 1.0
+
+ K[:, 0, 0] = 2.0 * znear / (max_x - min_x)
+ K[:, 1, 1] = 2.0 * znear / (max_y - min_y)
+ K[:, 0, 2] = (max_x + min_x) / (max_x - min_x)
+ K[:, 1, 2] = (max_y + min_y) / (max_y - min_y)
+ K[:, 3, 2] = z_sign
+
+ K[:, 2, 2] = z_sign * zfar / (zfar - znear)
+ K[:, 2, 3] = -(zfar * znear) / (zfar - znear)
+ K = K.repeat(batch_size, 1, 1)
+ return K
+
+ def to_ndc_(self, **kwargs):
+ """Not implemented."""
+ raise NotImplementedError()
+
+ def to_screen_(self, **kwargs):
+ """Not implemented."""
+ raise NotImplementedError()
+
+ def to_ndc(self, **kwargs):
+ """Not implemented."""
+ raise NotImplementedError()
+
+ def to_screen(self, **kwargs):
+ """Not implemented."""
+ raise NotImplementedError()
+
+
+@CAMERAS.register_module(name=('OrthographicCameras', 'Orthographic',
+ 'orthographic'))
+class OrthographicCameras(cameras.OrthographicCameras, MMCamerasBase):
+ """Inherited from Pytorch3D `OrthographicCameras`."""
+ def __init__(
+ self,
+ focal_length=1.0,
+ principal_point=((0.0, 0.0), ),
+ R: Optional[torch.Tensor] = None,
+ T: Optional[torch.Tensor] = None,
+ K: Optional[torch.Tensor] = None,
+ device: Union[torch.Tensor, str] = 'cpu',
+ in_ndc: bool = True,
+ image_size: Optional[torch.Tensor] = None,
+ convention: str = 'pytorch3d',
+ **kwargs,
+ ) -> None:
+ """Initialize OrthographicCameras.
+
+ Args:
+ focal_length (float, optional): Defaults to 1.0.
+ principal_point (tuple, optional): Defaults to ((0.0, 0.0), ).
+ R (Optional[torch.Tensor], optional): Defaults to None.
+ T (Optional[torch.Tensor], optional): Defaults to None.
+ K (Optional[torch.Tensor], optional): Defaults to None.
+ device (Union[torch.Tensor, str], optional): Defaults to 'cpu'.
+ in_ndc (bool, optional): Defaults to True.
+ image_size (Optional[torch.Tensor], optional): Defaults to None.
+ convention (str, optional): Defaults to 'pytorch3d'.
+
+ Raises:
+            ValueError: If the provided image_size contains values below 1.
+ """
+ if image_size is not None:
+ kwargs.update({'image_size': image_size})
+ kwargs.update(
+ _is_perspective=False,
+ _in_ndc=in_ndc,
+ )
+ kwargs.pop('is_perspective', None)
+ kwargs.pop('in_ndc', None)
+ super(cameras.OrthographicCameras,
+ self).__init__(device=device,
+ focal_length=focal_length,
+ principal_point=principal_point,
+ R=R,
+ T=T,
+ K=K,
+ convention=convention,
+ **kwargs)
+ if image_size is not None:
+ if (self.image_size < 1).any(): # pyre-ignore
+ raise ValueError('Image_size provided has invalid values')
+ else:
+ self.image_size = None
+
+ def get_ndc_camera_transform(self, **kwargs) -> Transform3d:
+ kwargs.pop('cameras', None)
+ return super().get_ndc_camera_transform(**kwargs)
+
+ def transform_points_screen(self,
+ points,
+ eps: Optional[float] = None,
+ **kwargs) -> torch.Tensor:
+ kwargs.pop('cameras', None)
+ return super().transform_points_screen(points, eps, **kwargs)
+
+ def __getitem__(self, index: Union[slice, int, torch.Tensor, List, Tuple]):
+ """Slice the cameras by batch dim.
+
+ Args:
+ index (Union[slice, int, torch.Tensor, List, Tuple]):
+ index for slicing.
+
+ Returns:
+ MMCamerasBase: sliced cameras.
+ """
+ return super(cameras.OrthographicCameras, self).__getitem__(index)
+
+ @classmethod
+ def get_default_projection_matrix(cls, **args) -> torch.Tensor:
+ """Class method. Calculate the projective transformation matrix by
+ default parameters.
+
+ .. code-block:: python
+
+ fx = focal_length[:,0]
+ fy = focal_length[:,1]
+ px = principal_point[:,0]
+ py = principal_point[:,1]
+
+ K = [[fx, 0, 0, px],
+ [0, fy, 0, py],
+ [0, 0, 1, 0],
+ [0, 0, 0, 1],]
+
+ Args:
+ **kwargs: parameters for the projection can be passed in as keyword
+ arguments to override the default values.
+
+ Return:
+ a `torch.Tensor` which represents a batch of projection matrices K
+ of shape (N, 4, 4)
+ """
+ batch_size = args.get('batch_size', 1)
+ device = args.get('device', 'cpu')
+ focal_length = args.get('focal_length')
+ principal_point = args.get('principal_point')
+
+ return cameras._get_sfm_calibration_matrix(
+ N=batch_size,
+ device=device,
+ focal_length=focal_length,
+ principal_point=principal_point,
+ orthographic=True)
+
+
+@CAMERAS.register_module(name=('FoVOrthographicCameras', 'FoVOrthographic',
+ 'fovorthographic'))
+class FoVOrthographicCameras(cameras.FoVOrthographicCameras, MMCamerasBase):
+ """Inherited from Pytorch3D `FoVOrthographicCameras`."""
+ def __init__(
+ self,
+ znear: Union[torch.Tensor, int, float] = 1.0,
+ zfar: Union[torch.Tensor, int, float] = 100.0,
+ max_y: Union[torch.Tensor, int, float] = 1.0,
+ min_y: Union[torch.Tensor, int, float] = -1.0,
+ max_x: Union[torch.Tensor, int, float] = 1.0,
+ min_x: Union[torch.Tensor, int, float] = -1.0,
+ scale_xyz: Union[Iterable[float],
+ Iterable[int]] = ((1.0, 1.0, 1.0), ), # (1, 3)
+ R: Optional[torch.Tensor] = None,
+ T: Optional[torch.Tensor] = None,
+ K: Optional[torch.Tensor] = None,
+ device: Union[torch.device, str] = 'cpu',
+ convention: str = 'pytorch3d',
+ **kwargs):
+ """reimplemented __init__, add `convention`.
+
+ Args:
+ znear (Union[torch.Tensor, int, float], optional):
+ Defaults to 1.0.
+ zfar (Union[torch.Tensor, int, float], optional):
+ Defaults to 100.0.
+ max_y (Union[torch.Tensor, int, float], optional):
+ Defaults to 1.0.
+ min_y (Union[torch.Tensor, int, float], optional):
+ Defaults to -1.0.
+ max_x (Union[torch.Tensor, int, float], optional):
+ Defaults to 1.0.
+ min_x (Union[torch.Tensor, int, float], optional):
+ Defaults to -1.0.
+ scale_xyz (Union[Iterable[float], Iterable[int]], optional):
+ Defaults to ((1.0, 1.0, 1.0), ).
+ T (Optional[torch.Tensor], optional): Defaults to None.
+ K (Optional[torch.Tensor], optional): Defaults to None.
+ device (Union[torch.device, str], optional): Defaults to 'cpu'.
+ convention (str, optional): Defaults to 'pytorch3d'.
+ """
+ kwargs.update(_is_perspective=False, _in_ndc=True)
+ kwargs.pop('in_ndc', None)
+ kwargs.pop('is_perspective', None)
+ super(cameras.FoVOrthographicCameras,
+ self).__init__(device=device,
+ znear=znear,
+ zfar=zfar,
+ max_y=max_y,
+ min_y=min_y,
+ max_x=max_x,
+ min_x=min_x,
+ scale_xyz=scale_xyz,
+ R=R,
+ T=T,
+ K=K,
+ convention=convention,
+ **kwargs)
+
+ def __getitem__(self, index: Union[slice, int, torch.Tensor, List, Tuple]):
+ """Slice the cameras by batch dim.
+
+ Args:
+ index (Union[slice, int, torch.Tensor, List, Tuple]):
+ index for slicing.
+
+ Returns:
+ MMCamerasBase: sliced cameras.
+ """
+ return super(cameras.FoVOrthographicCameras, self).__getitem__(index)
+
+ @classmethod
+ def get_default_projection_matrix(cls, **args) -> torch.Tensor:
+ """Class method. Calculate the projective transformation matrix by
+ default parameters.
+
+ .. code-block:: python
+
+ scale_x = 2 / (max_x - min_x)
+ scale_y = 2 / (max_y - min_y)
+ scale_z = 2 / (far-near)
+ mid_x = (max_x + min_x) / (max_x - min_x)
+            mid_y = (max_y + min_y) / (max_y - min_y)
+ mid_z = (far + near) / (far - near)
+
+ K = [[scale_x, 0, 0, -mid_x],
+                 [0, scale_y, 0, -mid_y],
+ [0, 0, -scale_z, -mid_z],
+ [0, 0, 0, 1],]
+
+ Args:
+ **kwargs: parameters for the projection can be passed in as keyword
+ arguments to override the default values.
+
+ Return:
+ a `torch.Tensor` which represents a batch of projection matrices K
+ of shape (N, 4, 4)
+ """
+ znear = args.get('znear', 1.0)
+ zfar = args.get('zfar', 100.0)
+ max_y = args.get('max_y', 1.0)
+ min_y = args.get('min_y', -1.0)
+ max_x = args.get('max_x', 1.0)
+ min_x = args.get('min_x', -1.0)
+ scale_xyz = args.get(
+ 'scale_xyz',
+ ((1.0, 1.0, 1.0), ),
+ )
+ batch_size = args.get('batch_size', 1)
+
+ K = torch.zeros((1, 4, 4), dtype=torch.float32)
+ ones = torch.ones((1), dtype=torch.float32)
+ z_sign = +1.0
+
+ if not isinstance(scale_xyz, torch.Tensor):
+ scale_xyz = torch.Tensor(scale_xyz)
+ K[:, 0, 0] = (2.0 / (max_x - min_x)) * scale_xyz[:, 0]
+ K[:, 1, 1] = (2.0 / (max_y - min_y)) * scale_xyz[:, 1]
+ K[:, 0, 3] = -(max_x + min_x) / (max_x - min_x)
+ K[:, 1, 3] = -(max_y + min_y) / (max_y - min_y)
+ K[:, 3, 3] = ones
+
+        # NOTE: This maps the z coordinate to the range [0, 1] and replaces
+        # the OpenGL z normalization to [-1, 1]
+ K[:, 2, 2] = z_sign * (1.0 / (zfar - znear)) * scale_xyz[:, 2]
+ K[:, 2, 3] = -znear / (zfar - znear)
+ K = K.repeat(batch_size, 1, 1)
+ return K
+
+ def to_ndc_(self, **kwargs):
+ """Not implemented."""
+ raise NotImplementedError()
+
+ def to_screen_(self, **kwargs):
+ """Not implemented."""
+ raise NotImplementedError()
+
+ def to_ndc(self, **kwargs):
+ """Not implemented."""
+ raise NotImplementedError()
+
+ def to_screen(self, **kwargs):
+ """Not implemented."""
+ raise NotImplementedError()
+
+ def get_ndc_camera_transform(self, **kwargs) -> Transform3d:
+ kwargs.pop('cameras', None)
+ return super().get_ndc_camera_transform(**kwargs)
+
+ def transform_points_screen(self,
+ points,
+ eps: Optional[float] = None,
+ **kwargs) -> torch.Tensor:
+ kwargs.pop('cameras', None)
+ return super().transform_points_screen(points, eps, **kwargs)
+
+
+def concat_cameras(cameras_list: List[MMCamerasBase]) -> MMCamerasBase:
+ """Concat a list of cameras of the same type.
+
+ Args:
+ cameras_list (List[cameras.CamerasBase]): a list of cameras.
+
+ Returns:
+        MMCamerasBase: the returned cameras concatenated along the batch
+            dim.
+ """
+ K = []
+ R = []
+ T = []
+ is_perspective = cameras_list[0].is_perspective()
+ in_ndc = cameras_list[0].in_ndc()
+ cam_cls = type(cameras_list[0])
+ image_size = cameras_list[0].get_image_size()
+ device = cameras_list[0].device
+ for cam in cameras_list:
+ assert type(cam) is cam_cls
+ assert cam.in_ndc() is in_ndc
+ assert cam.is_perspective() is is_perspective
+ assert cam.device is device
+ K.append(cam.K)
+ R.append(cam.R)
+ T.append(cam.T)
+ K = torch.cat(K)
+ R = torch.cat(R)
+ T = torch.cat(T)
+ concated_cameras = cam_cls(K=K,
+ R=R,
+ T=T,
+ device=device,
+ is_perspective=is_perspective,
+ in_ndc=in_ndc,
+ image_size=image_size)
+ return concated_cameras
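+
+# Illustrative sketch (not part of the original file): merging two compatible
+# camera batches, e.g.
+#   merged = concat_cameras([cam_a, cam_b])
+# stacks K, R and T along the batch dim; all cameras must share the same
+# class, ndc flag, perspective flag and device.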
+
+
+def compute_orbit_cameras(
+ K: Union[torch.Tensor, np.ndarray, None] = None,
+ elev: float = 0,
+ azim: float = 0,
+ dist: float = 2.7,
+ at: Union[torch.Tensor, List, Tuple] = (0, 0, 0),
+ batch_size: int = 1,
+ orbit_speed: Union[float, Tuple[float, float]] = 0,
+ dist_speed: Optional[float] = 0,
+ convention: str = 'pytorch3d',
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """Generate a sequence of moving cameras following an orbit.
+
+ Args:
+ K (Union[torch.Tensor, np.ndarray, None], optional):
+ Intrinsic matrix. Will generate a default K if None.
+ Defaults to None.
+ elev (float, optional): This is the angle between the
+ vector from the object to the camera, and the horizontal
+ plane y = 0 (xz-plane).
+ Defaults to 0.
+ azim (float, optional): angle in degrees or radians. The vector
+ from the object to the camera is projected onto a horizontal
+ plane y = 0. azim is the angle between the projected vector and a
+ reference vector at (0, 0, 1) on the reference plane (the
+ horizontal plane).
+ Defaults to 0.
+ dist (float, optional): distance of the camera from the object.
+ Defaults to 2.7.
+ at (Union[torch.Tensor, List, Tuple], optional):
+ the position of the object(s) in world coordinates.
+ Defaults to (0, 0, 0).
+ batch_size (int, optional): number of frames. Defaults to 1.
+ orbit_speed (Union[float, Tuple[float, float]], optional):
+ degree speed of camera moving along the orbit.
+            Could be one or two numbers. A single number sets only the
+            azimuth speed; two numbers set (azimuth, elevation) speeds.
+ Defaults to 0.
+ dist_speed (Optional[float], optional):
+ speed of camera moving along the center line.
+ Defaults to 0.
+ convention (str, optional): Camera convention. Defaults to 'pytorch3d'.
+
+ Returns:
+        Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: computed K, R, T.
+ """
+ if not isinstance(orbit_speed, Iterable):
+ orbit_speed = (orbit_speed, 0.0)
+ if not isinstance(at, torch.Tensor):
+ at = torch.Tensor(at)
+ at = at.view(1, 3)
+ if batch_size > 1 and orbit_speed[0] != 0:
+ azim = torch.linspace(azim, azim + batch_size * orbit_speed[0],
+ batch_size)
+ if batch_size > 1 and orbit_speed[1] != 0:
+ elev = torch.linspace(elev, elev + batch_size * orbit_speed[1],
+ batch_size)
+ if batch_size > 1 and dist_speed != 0:
+ dist = torch.linspace(dist, dist + batch_size * dist_speed, batch_size)
+
+ if convention == 'opencv':
+ rotation_compensate = ee_to_rotmat(
+ torch.Tensor([math.pi, 0, 0]).view(1, 3))
+ at = rotation_compensate.permute(0, 2, 1) @ at.view(-1, 3, 1)
+ at = at.view(1, 3)
+ R, T = cameras.look_at_view_transform(dist=dist,
+ elev=elev,
+ azim=azim,
+ at=at)
+ if K is None:
+ K = FoVPerspectiveCameras.get_default_projection_matrix(
+ batch_size=batch_size)
+ if convention == 'opencv':
+ rotation_compensate = ee_to_rotmat(
+ torch.Tensor([math.pi, 0, 0]).view(1, 3))
+ R = rotation_compensate.permute(0, 2, 1) @ R
+ return K, R, T
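+
+# Illustrative sketch (hypothetical values, not in the original file): a
+# 30-frame orbit around the origin, advancing the azimuth by 3 degrees per
+# frame with a fixed elevation and distance:
+#   K, R, T = compute_orbit_cameras(elev=10, azim=0, dist=2.7,
+#                                   batch_size=30, orbit_speed=3)
+#   # K: (30, 4, 4), R: (30, 3, 3), T: (30, 3)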
+
+
+def compute_direction_cameras(
+ K: Union[torch.Tensor, np.ndarray, None] = None,
+ at: Union[torch.Tensor, List, Tuple, None] = None,
+ eye: Union[torch.Tensor, List, Tuple, None] = None,
+ plane: Union[Iterable[torch.Tensor], None] = None,
+ dist: float = 1.0,
+ batch_size: int = 1,
+ dist_speed: float = 0.0,
+ z_vec: Union[torch.Tensor, List, Tuple, None] = None,
+ y_vec: Union[torch.Tensor, List, Tuple] = (0, 1, 0),
+ convention: str = 'pytorch3d',
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+ """Generate a sequence of moving cameras along a direction.
+ We need a `z_vec`, `y_vec` to generate `x_vec` so as to get the `R` matrix.
+ And we need `eye` as `T` matrix.
+ `K` matrix could be set or use default.
+ We recommend `y_vec` as default (0, 1, 0), and it will be orthogonal
+ decomposed. The `x_vec` will be generated by cross production from
+ `y_vec` and `x_vec`.
+ You can set `z_vec` by: 1. set `at`, `dist`, `dist_speed`, `plane`,
+ `batch_size` to get `eye`, then get `z_vec`.
+ 2. set `at`, `eye` directly and get `z_vec`.
+ 3. set `z_vec` directly and:
+ 1). set `eye` and `dist`.
+ 2). set `at`, `dist`, `dist_speed`,
+ `batch_size` then get `eye`.
+ When we have `eye`, `z_vec`, `y_vec`, we will have `R` and `T`.
+
+ Args:
+ K (Union[torch.Tensor, np.ndarray, None], optional):
+ Intrinsic matrix. Will generate a default K if None.
+ Defaults to None.
+ at (Union[torch.Tensor, List, Tuple], optional):
+ the position of the object(s) in world coordinates.
+ Required.
+ Defaults to None.
+ eye (Union[torch.Tensor, List, Tuple], optional):
+ the position of the camera(s) in world coordinates.
+ If eye is not None, it will override the camera position derived
+ from plane, dist, dist_speed.
+ Defaults to None.
+ plane (Optional[Iterable[torch.Tensor, List, Tuple]], optional):
+ The plane of your z direction normal.
+ Should be a tuple or list containing two vectors of shape (N, 3).
+ Defaults to None.
+ dist (float, optional): distance to at.
+ Defaults to 1.0.
+ dist_speed (float, optional): distance moving speed.
+            Defaults to 0.0.
+ batch_size (int, optional): number of frames.
+ Defaults to 1.
+ z_vec (Union[torch.Tensor, List, Tuple], optional):
+ z direction of shape (-1, 3). If z_vec is not None, it will
+ override plane, dist, dist_speed.
+ Defaults to None.
+ y_vec (Union[torch.Tensor, List, Tuple], optional):
+ Will only be used when z_vec is used.
+ Defaults to (0, 1, 0).
+ convention (str, optional): Camera convention.
+ Defaults to 'pytorch3d'.
+
+ Returns:
+ Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: computed K, R, T.
+ """
+ def norm_vec(vec):
+ return vec / torch.sqrt((vec * vec).sum())
+
+ if z_vec is None:
+ assert at is not None
+ at = torch.Tensor(at).view(-1, 3)
+ if eye is None:
+ assert plane is not None
+ dist = torch.linspace(dist, dist + batch_size * dist_speed,
+ batch_size)
+ vec1 = torch.Tensor(plane[0]).view(-1, 3)
+ norm_vec1 = norm_vec(vec1)
+ vec2 = torch.Tensor(plane[1]).view(-1, 3)
+ norm_vec2 = norm_vec(vec2)
+ norm = torch.cross(norm_vec1, norm_vec2)
+ normed_norm = norm_vec(norm)
+ eye = at + normed_norm * dist
+ else:
+ eye = torch.Tensor(eye).view(-1, 3)
+ norm = eye - at
+ normed_norm = norm_vec(norm)
+
+ z_vec = -normed_norm
+ else:
+ z_vec = torch.Tensor(z_vec).view(-1, 3)
+ z_vec = norm_vec(z_vec)
+ if eye is None:
+ assert at is not None
+ at = torch.Tensor(at).view(-1, 3)
+ dist = torch.linspace(dist, dist + batch_size * dist_speed,
+ batch_size)
+ eye = -z_vec * dist + at
+ eye = torch.Tensor(eye).view(-1, 3)
+ assert eye is not None
+ z_vec = norm_vec(z_vec)
+ normed_norm = -z_vec
+
+ z_vec = z_vec.view(-1, 3)
+ y_vec = torch.Tensor(y_vec).view(-1, 3)
+
+ y_vec = y_vec - torch.bmm(y_vec.view(-1, 1, 3), z_vec.view(-1, 3, 1)).view(
+ -1, 1) * z_vec
+ y_vec = norm_vec(y_vec)
+ x_vec = torch.cross(y_vec, z_vec)
+ R = torch.cat(
+ [x_vec.view(-1, 3, 1),
+ y_vec.view(-1, 3, 1),
+ z_vec.view(-1, 3, 1)], 1).view(-1, 3, 3)
+ T = eye
+
+ R = R.permute(0, 2, 1)
+ _, T = convert_world_view(R=R, T=T)
+
+ if K is None:
+ K = FoVPerspectiveCameras.get_default_projection_matrix(
+ batch_size=batch_size)
+ K, R, T = convert_camera_matrix(K=K,
+ R=R,
+ T=T,
+ is_perspective=True,
+ convention_src='pytorch3d',
+ convention_dst=convention)
+ return K, R, T
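+
+
+# Illustrative sketch (hypothetical values, not in the original file): a
+# single camera placed one unit away from the origin along +z, looking at
+# the origin:
+#   K, R, T = compute_direction_cameras(at=(0, 0, 0), eye=(0, 0, 1))
+#   # K: (1, 4, 4), R: (1, 3, 3), T: (1, 3)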
diff --git a/detrsmpl/core/conventions/__init__.py b/detrsmpl/core/conventions/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/core/conventions/cameras/__init__.py b/detrsmpl/core/conventions/cameras/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/core/conventions/cameras/convert_convention.py b/detrsmpl/core/conventions/cameras/convert_convention.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb76635a81fdd1b4d005a0f6dee5c64e5f2299f9
--- /dev/null
+++ b/detrsmpl/core/conventions/cameras/convert_convention.py
@@ -0,0 +1,649 @@
+import warnings
+from typing import Iterable, List, Optional, Tuple, Union
+
+import numpy as np
+import torch
+
+from detrsmpl.utils.transforms import ee_to_rotmat, rotmat_to_ee
+
+CAMERA_CONVENTIONS = {
+ 'pytorch3d': {
+ 'axis': '-xyz',
+ 'left_mm_extrinsic': False,
+ 'view_to_world': False,
+ 'left_mm_intrinsic': True,
+ },
+ 'pyrender': {
+ 'axis': 'xy-z',
+ 'left_mm_extrinsic': True,
+ 'view_to_world': False,
+ 'left_mm_intrinsic': True,
+ },
+ 'opengl': {
+ 'axis': 'xy-z',
+ 'left_mm_extrinsic': True,
+ 'view_to_world': False,
+ 'left_mm_intrinsic': True,
+ },
+ 'open3d': {
+ 'axis': 'x-yz',
+ 'left_mm_extrinsic': False,
+ 'view_to_world': False,
+ 'left_mm_intrinsic': False,
+ },
+ 'opencv': {
+ 'axis': 'x-yz',
+ 'left_mm_extrinsic': True,
+ 'view_to_world': True,
+ 'left_mm_intrinsic': True,
+ },
+ 'unity': {
+ 'axis': 'xyz',
+ 'left_mm_extrinsic': True,
+ 'view_to_world': False,
+ 'left_mm_intrinsic': True,
+ },
+ 'blender': {
+ 'axis': 'xy-z',
+ 'left_mm_extrinsic': True,
+ 'view_to_world': False,
+ 'left_mm_intrinsic': True,
+ },
+ 'maya': {
+ 'axis': 'xy-z',
+ 'left_mm_extrinsic': True,
+ 'view_to_world': False,
+ 'left_mm_intrinsic': True,
+ }
+}
+
+
+def enc_camera_convention(convention, camera_conventions=CAMERA_CONVENTIONS):
+ """convert camera convention to axis direction and order."""
+ if convention in camera_conventions:
+ convention = camera_conventions[convention]['axis']
+ else:
+        assert set(convention).issubset({'x', 'y', 'z', '+', '-'}), \
+            'Wrong convention string, choose either in ' \
+            f'set({camera_conventions.keys()}) or define by xyz.'
+ sign = [1, 1, 1]
+ convention = '_' + convention
+ count = 0
+ axis_order = ''
+ for i in range(len(convention)):
+ if convention[i] in 'xyz':
+ axis_order += convention[i]
+ if convention[i - 1] == '-':
+ sign[count] *= -1
+ count += 1
+ return sign, axis_order
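+
+# Illustrative sketch (not part of the original file): the encoding returns
+# per-axis signs plus the axis order, e.g.
+#   enc_camera_convention('opencv')    -> ([1, -1, 1], 'xyz')
+#   enc_camera_convention('pytorch3d') -> ([-1, 1, 1], 'xyz')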
+
+
+def convert_camera_matrix(
+ K: Optional[Union[torch.Tensor, np.ndarray]] = None,
+ R: Optional[Union[torch.Tensor, np.ndarray]] = None,
+ T: Optional[Union[torch.Tensor, np.ndarray]] = None,
+ is_perspective: bool = True,
+ convention_src: str = 'opencv',
+ convention_dst: str = 'pytorch3d',
+ in_ndc_src: bool = True,
+ in_ndc_dst: bool = True,
+ resolution_src: Optional[Union[int, Tuple[int, int], torch.Tensor,
+ np.ndarray]] = None,
+ resolution_dst: Optional[Union[int, Tuple[int, int], torch.Tensor,
+ np.ndarray]] = None,
+ camera_conventions: dict = CAMERA_CONVENTIONS,
+) -> Tuple[Union[torch.Tensor, np.ndarray], Union[torch.Tensor, np.ndarray],
+ Union[torch.Tensor, np.ndarray]]:
+ """Convert the intrinsic matrix K and extrinsic matrix [R|T] from source
+ convention to destination convention.
+
+ Args:
+ K (Union[torch.Tensor, np.ndarray]): Intrinsic matrix,
+ shape should be (batch_size, 4, 4) or (batch_size, 3, 3).
+ Will be ignored if None.
+ R (Optional[Union[torch.Tensor, np.ndarray]], optional):
+ Extrinsic rotation matrix. Shape should be (batch_size, 3, 3).
+ Will be identity if None.
+ Defaults to None.
+ T (Optional[Union[torch.Tensor, np.ndarray]], optional):
+ Extrinsic translation matrix. Shape should be (batch_size, 3).
+ Will be zeros if None.
+ Defaults to None.
+ is_perspective (bool, optional): whether is perspective projection.
+ Defaults to True.
+
+ _____________________________________________________________________
+ # Camera dependent args
+ convention_src (str, optional): convention of source camera,
+ convention_dst (str, optional): convention of destination camera,
+
+ We define the convention of cameras by the order of right, front and
+ up.
+ E.g., the first one is pyrender and its convention should be
+ '+x+z+y'. '+' could be ignored.
+ The second one is opencv and its convention should be '+x-z-y'.
+ The third one is pytorch3d and its convention should be '-xzy'.
+ opengl(pyrender) opencv pytorch3d
+ y z y
+ | / |
+ | / |
+ |_______x /________x x________ |
+ / | /
+ / | /
+ z / y | z /
+
+ in_ndc_src (bool, optional): Whether is the source camera defined
+ in ndc.
+ Defaults to True.
+ in_ndc_dst (bool, optional): Whether is the destination camera defined
+ in ndc.
+ Defaults to True.
+
+        In camera_convention, we define these args as:
+            1). `left_mm_ex` means the extrinsic matrix [`R`|`T`] is defined
+                for left matrix multiplication.
+            2). `left_mm_in` means the intrinsic matrix `K` is defined for
+                left matrix multiplication.
+            3). `view_to_world` means the extrinsic matrix [`R`|`T`] is
+                defined as view to world.
+
+ resolution_src (Optional[Union[int, Tuple[int, int], torch.Tensor,
+ np.ndarray]], optional):
+ Source camera image size of (height, width).
+ Required if defined in screen.
+ Will be square if int.
+ Shape should be (2,) if `array` or `tensor`.
+ Defaults to None.
+ resolution_dst (Optional[Union[int, Tuple[int, int], torch.Tensor,
+ np.ndarray]], optional):
+ Destination camera image size of (height, width).
+ Required if defined in screen.
+ Will be square if int.
+ Shape should be (2,) if `array` or `tensor`.
+ Defaults to None.
+ camera_conventions: (dict, optional): `dict` containing
+ pre-defined camera convention information.
+ Defaults to CAMERA_CONVENTIONS.
+
+ Raises:
+ TypeError: K, R, T should all be `torch.Tensor` or `np.ndarray`.
+
+ Returns:
+ Tuple[Union[torch.Tensor, None], Union[torch.Tensor, None],
+ Union[torch.Tensor, None]]:
+ Converted K, R, T matrix of `tensor`.
+ """
+ convention_dst = convention_dst.lower()
+ convention_src = convention_src.lower()
+
+ assert convention_dst in CAMERA_CONVENTIONS
+ assert convention_src in CAMERA_CONVENTIONS
+
+ left_mm_ex_src = CAMERA_CONVENTIONS[convention_src].get(
+ 'left_mm_extrinsic', True)
+ view_to_world_src = CAMERA_CONVENTIONS[convention_src].get(
+ 'view_to_world', False)
+ left_mm_in_src = CAMERA_CONVENTIONS[convention_src].get(
+ 'left_mm_intrinsic', False)
+
+ left_mm_ex_dst = CAMERA_CONVENTIONS[convention_dst].get(
+ 'left_mm_extrinsic', True)
+ view_to_world_dst = CAMERA_CONVENTIONS[convention_dst].get(
+ 'view_to_world', False)
+ left_mm_in_dst = CAMERA_CONVENTIONS[convention_dst].get(
+ 'left_mm_intrinsic', False)
+
+ sign_src, axis_src = enc_camera_convention(convention_src,
+ camera_conventions)
+ sign_dst, axis_dst = enc_camera_convention(convention_dst,
+ camera_conventions)
+ sign = torch.Tensor(sign_dst) / torch.Tensor(sign_src)
+
+ type_ = []
+ for x in [K, R, T]:
+ if x is not None:
+ type_.append(type(x))
+ if len(type_) > 0:
+ if not all(x == type_[0] for x in type_):
+ raise TypeError('Input type should be the same.')
+
+ use_numpy = False
+ if np.ndarray in type_:
+ use_numpy = True
+ # convert raw matrix to tensor
+ if isinstance(K, np.ndarray):
+ new_K = torch.Tensor(K)
+ elif K is None:
+ new_K = None
+ elif isinstance(K, torch.Tensor):
+ new_K = K.clone()
+ else:
+ raise TypeError(
+ f'K should be `torch.Tensor` or `numpy.ndarray`, type(K): '
+ f'{type(K)}')
+
+ if isinstance(R, np.ndarray):
+ new_R = torch.Tensor(R).view(-1, 3, 3)
+ elif R is None:
+ new_R = torch.eye(3, 3)[None]
+ elif isinstance(R, torch.Tensor):
+ new_R = R.clone().view(-1, 3, 3)
+ else:
+ raise TypeError(
+ f'R should be `torch.Tensor` or `numpy.ndarray`, type(R): '
+ f'{type(R)}')
+
+ if isinstance(T, np.ndarray):
+ new_T = torch.Tensor(T).view(-1, 3)
+ elif T is None:
+ new_T = torch.zeros(1, 3)
+ elif isinstance(T, torch.Tensor):
+ new_T = T.clone().view(-1, 3)
+ else:
+ raise TypeError(
+ f'T should be `torch.Tensor` or `numpy.ndarray`, type(T): '
+ f'{type(T)}')
+
+ if axis_dst != axis_src:
+ new_R = ee_to_rotmat(rotmat_to_ee(new_R, convention=axis_src),
+ convention=axis_dst)
+
+ # convert extrinsic to world_to_view
+ if view_to_world_src is True:
+ new_R, new_T = convert_world_view(new_R, new_T)
+
+ # right mm to left mm
+ if (not left_mm_ex_src) and left_mm_ex_dst:
+ new_R *= sign.to(new_R.device)
+ new_R = new_R.permute(0, 2, 1)
+ # left mm to right mm
+ elif left_mm_ex_src and (not left_mm_ex_dst):
+ new_R = new_R.permute(0, 2, 1)
+ new_R *= sign.to(new_R.device)
+ # right_mm to right mm
+ elif (not left_mm_ex_dst) and (not left_mm_ex_src):
+ new_R *= sign.to(new_R.device)
+ # left mm to left mm
+ elif left_mm_ex_src and left_mm_ex_dst:
+ new_R *= sign.view(3, 1).to(new_R.device)
+ new_T *= sign.to(new_T.device)
+
+    # convert extrinsic to the destination definition
+ if view_to_world_dst is True:
+ new_R, new_T = convert_world_view(new_R, new_T)
+
+ # in ndc or in screen
+ if in_ndc_dst is False and in_ndc_src is True:
+ assert resolution_dst is not None, \
+ 'dst in screen, should specify resolution_dst.'
+
+ if in_ndc_src is False and in_ndc_dst is True:
+ assert resolution_src is not None, \
+            'src in screen, should specify resolution_src.'
+ if resolution_dst is None:
+ resolution_dst = 2.0
+ if resolution_src is None:
+ resolution_src = 2.0
+
+ if new_K is not None:
+ if left_mm_in_src is False and left_mm_in_dst is True:
+ new_K = new_K.permute(0, 2, 1)
+ if new_K.shape[-2:] == (3, 3):
+ new_K = convert_K_3x3_to_4x4(new_K, is_perspective)
+ # src in ndc, dst in screen
+
+ if in_ndc_src is True and (in_ndc_dst is False):
+ new_K = convert_ndc_to_screen(K=new_K,
+ is_perspective=is_perspective,
+ sign=sign.to(new_K.device),
+ resolution=resolution_dst)
+ # src in screen, dst in ndc
+ elif in_ndc_src is False and in_ndc_dst is True:
+ new_K = convert_screen_to_ndc(K=new_K,
+ is_perspective=is_perspective,
+ sign=sign.to(new_K.device),
+ resolution=resolution_src)
+ # src in ndc, dst in ndc
+ elif in_ndc_src is True and in_ndc_dst is True:
+ if is_perspective:
+ new_K[:, 0, 2] *= sign[0].to(new_K.device)
+ new_K[:, 1, 2] *= sign[1].to(new_K.device)
+ else:
+ new_K[:, 0, 3] *= sign[0].to(new_K.device)
+ new_K[:, 1, 3] *= sign[1].to(new_K.device)
+ # src in screen, dst in screen
+ else:
+ pass
+
+ if left_mm_in_src is True and left_mm_in_dst is False:
+ new_K = new_K.permute(0, 2, 1)
+
+ num_batch = max(new_K.shape[0], new_R.shape[0], new_T.shape[0])
+ if new_K.shape[0] == 1:
+ new_K = new_K.repeat(num_batch, 1, 1)
+ if new_R.shape[0] == 1:
+ new_R = new_R.repeat(num_batch, 1, 1)
+ if new_T.shape[0] == 1:
+ new_T = new_T.repeat(num_batch, 1)
+
+ if use_numpy:
+ if isinstance(new_K, torch.Tensor):
+ new_K = new_K.cpu().numpy()
+ if isinstance(new_R, torch.Tensor):
+ new_R = new_R.cpu().numpy()
+ if isinstance(new_T, torch.Tensor):
+ new_T = new_T.cpu().numpy()
+ return new_K, new_R, new_T
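+
+# Illustrative sketch (hypothetical names, not in the original file): convert
+# an OpenCV-calibrated camera with screen-space intrinsics to the pytorch3d
+# NDC convention:
+#   K_p3d, R_p3d, T_p3d = convert_camera_matrix(
+#       K=K_cv, R=R_cv, T=T_cv, is_perspective=True,
+#       convention_src='opencv', convention_dst='pytorch3d',
+#       in_ndc_src=False, in_ndc_dst=True, resolution_src=(1080, 1920))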
+
+
+def convert_K_3x3_to_4x4(
+ K: Union[torch.Tensor, np.ndarray],
+ is_perspective: bool = True) -> Union[torch.Tensor, np.ndarray]:
+ """Convert opencv 3x3 intrinsic matrix to 4x4.
+
+ Args:
+ K (Union[torch.Tensor, np.ndarray]):
+ Input 3x3 intrinsic matrix, left mm defined.
+ [[fx, 0, px],
+ [0, fy, py],
+ [0, 0, 1]]
+ is_perspective (bool, optional): whether is perspective projection.
+ Defaults to True.
+
+ Raises:
+ TypeError: K is not `Tensor` or `array`.
+ ValueError: Shape is not (batch, 3, 3) or (3, 3)
+
+ Returns:
+ Union[torch.Tensor, np.ndarray]:
+ Output intrinsic matrix.
+ for perspective:
+ [[fx, 0, px, 0],
+ [0, fy, py, 0],
+ [0, 0, 0, 1],
+ [0, 0, 1, 0]]
+
+ for orthographics:
+ [[fx, 0, 0, px],
+ [0, fy, 0, py],
+ [0, 0, 1, 0],
+ [0, 0, 0, 1]]
+ """
+ if isinstance(K, torch.Tensor):
+ K = K.clone()
+ elif isinstance(K, np.ndarray):
+ K = K.copy()
+
+ else:
+ raise TypeError('K should be `torch.Tensor` or `numpy.ndarray`, '
+ f'type(K): {type(K)}.')
+ if K.shape[-2:] == (4, 4):
+ warnings.warn(
+ f'shape of K already is {K.shape}, will skip converting.')
+ return K
+ use_numpy = False
+ if K.ndim == 2:
+ K = K[None].reshape(-1, 3, 3)
+ elif K.ndim == 3:
+ K = K.reshape(-1, 3, 3)
+ else:
+ raise ValueError(f'Wrong ndim of K: {K.ndim}')
+
+ if isinstance(K, np.ndarray):
+ use_numpy = True
+ if is_perspective:
+ if use_numpy:
+ K_ = np.zeros((K.shape[0], 4, 4))
+ else:
+ K_ = torch.zeros(K.shape[0], 4, 4)
+ K_[:, :2, :3] = K[:, :2, :3]
+ K_[:, 3, 2] = 1
+ K_[:, 2, 3] = 1
+ else:
+ if use_numpy:
+ K_ = np.eye(4, 4)[None].repeat(K.shape[0], 0)
+ else:
+ K_ = torch.eye(4, 4)[None].repeat(K.shape[0], 1, 1)
+ K_[:, :2, :2] = K[:, :2, :2]
+ K_[:, :2, 3:] = K[:, :2, 2:]
+ return K_
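+
+
+# A minimal usage sketch (fx/fy/px/py values are made up): lift an OpenCV-style
+# 3x3 intrinsic matrix to the 4x4 perspective form used by this module.
+#
+#   K3 = torch.Tensor([[[1000., 0., 512.],
+#                       [0., 1000., 384.],
+#                       [0., 0., 1.]]])
+#   K4 = convert_K_3x3_to_4x4(K3, is_perspective=True)  # shape (1, 4, 4)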
+
+
+def convert_K_4x4_to_3x3(
+ K: Union[torch.Tensor, np.ndarray],
+ is_perspective: bool = True) -> Union[torch.Tensor, np.ndarray]:
+ """Convert opencv 4x4 intrinsic matrix to 3x3.
+
+ Args:
+ K (Union[torch.Tensor, np.ndarray]):
+ Input 4x4 intrinsic matrix, left mm defined.
+ for perspective:
+ [[fx, 0, px, 0],
+ [0, fy, py, 0],
+ [0, 0, 0, 1],
+ [0, 0, 1, 0]]
+
+ for orthographic:
+ [[fx, 0, 0, px],
+ [0, fy, 0, py],
+ [0, 0, 1, 0],
+ [0, 0, 0, 1]]
+ is_perspective (bool, optional): whether is perspective projection.
+ Defaults to True.
+
+ Raises:
+ TypeError: type K should be `Tensor` or `array`.
+ ValueError: Shape is not (batch, 3, 3) or (3, 3).
+
+ Returns:
+ Union[torch.Tensor, np.ndarray]:
+ Output 3x3 intrinsic matrix, left mm defined.
+ [[fx, 0, px],
+ [0, fy, py],
+ [0, 0, 1]]
+ """
+
+ if isinstance(K, torch.Tensor):
+ K = K.clone()
+ elif isinstance(K, np.ndarray):
+ K = K.copy()
+ else:
+ raise TypeError('K should be `torch.Tensor` or `numpy.ndarray`, '
+ f'type(K): {type(K)}.')
+ if K.shape[-2:] == (3, 3):
+ warnings.warn(
+ f'shape of K already is {K.shape}, will skip converting.')
+ return K
+ use_numpy = isinstance(K, np.ndarray)
+ if K.ndim == 2:
+ K = K[None].reshape(-1, 4, 4)
+ elif K.ndim == 3:
+ K = K.reshape(-1, 4, 4)
+ else:
+ raise ValueError(f'Wrong ndim of K: {K.ndim}')
+
+ if use_numpy:
+ K_ = np.eye(3, 3)[None].repeat(K.shape[0], 0)
+ else:
+ K_ = torch.eye(3, 3)[None].repeat(K.shape[0], 1, 1)
+ if is_perspective:
+ K_[:, :2, :3] = K[:, :2, :3]
+ else:
+ K_[:, :2, :2] = K[:, :2, :2]
+ K_[:, :2, 2:3] = K[:, :2, 3:4]
+ return K_
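+
+
+# Round-trip sketch (same made-up K3 as in the sketch above): for perspective
+# cameras this function undoes convert_K_3x3_to_4x4.
+#
+#   K3_back = convert_K_4x4_to_3x3(convert_K_3x3_to_4x4(K3),
+#                                  is_perspective=True)
+#   # torch.allclose(K3, K3_back) -> True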
+
+
+def convert_ndc_to_screen(
+ K: Union[torch.Tensor, np.ndarray],
+ resolution: Union[int, Tuple[int, int], List[int], torch.Tensor,
+ np.ndarray],
+ sign: Optional[Iterable[int]] = None,
+ is_perspective: bool = True) -> Union[torch.Tensor, np.ndarray]:
+ """Convert intrinsic matrix from ndc to screen.
+
+ Args:
+ K (Union[torch.Tensor, np.ndarray]):
+ Input 4x4 intrinsic matrix, left mm defined.
+ resolution (Union[int, Tuple[int, int], torch.Tensor, np.ndarray]):
+ (height, width) of image.
+ sign (Optional[Iterable[int]], optional): xyz axis sign.
+ Defaults to None.
+ is_perspective (bool, optional): whether is perspective projection.
+ Defaults to True.
+
+ Raises:
+ TypeError: K should be Tensor or array.
+ ValueError: shape of K should be (batch, 4, 4)
+
+ Returns:
+ Union[torch.Tensor, np.ndarray]: output intrinsic matrix.
+ """
+ sign = [1, 1, 1] if sign is None else sign
+ if isinstance(K, torch.Tensor):
+ K = K.clone()
+ elif isinstance(K, np.ndarray):
+ K = K.copy()
+ else:
+ raise TypeError(
+ f'K should be `torch.Tensor` or `np.ndarray`, type(K): {type(K)}')
+ if K.ndim == 2:
+ K = K[None].reshape(-1, 4, 4)
+ elif K.ndim == 3:
+ K = K.reshape(-1, 4, 4)
+ else:
+ raise ValueError(f'Wrong ndim of K: {K.ndim}')
+
+ if isinstance(resolution, (int, float)):
+ w_dst = h_dst = resolution
+ elif isinstance(resolution, (list, tuple)):
+ h_dst, w_dst = resolution
+ elif isinstance(resolution, (torch.Tensor, np.ndarray)):
+ resolution = resolution.reshape(-1, 2)
+ h_dst, w_dst = resolution[:, 0], resolution[:, 1]
+
+ aspect_ratio = w_dst / h_dst
+ K[:, 0, 0] *= w_dst / 2
+ K[:, 1, 1] *= h_dst / 2
+ if aspect_ratio > 1:
+ K[:, 0, 0] /= aspect_ratio
+ else:
+ K[:, 1, 1] *= aspect_ratio
+ if is_perspective:
+ K[:, 0, 2] *= sign[0]
+ K[:, 1, 2] *= sign[1]
+ K[:, 0, 2] = (K[:, 0, 2] + 1) * (w_dst / 2)
+ K[:, 1, 2] = (K[:, 1, 2] + 1) * (h_dst / 2)
+ else:
+ K[:, 0, 3] *= sign[0]
+ K[:, 1, 3] *= sign[1]
+ K[:, 0, 3] = (K[:, 0, 3] + 1) * (w_dst / 2)
+ K[:, 1, 3] = (K[:, 1, 3] + 1) * (h_dst / 2)
+ return K
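+
+
+# A minimal sketch (`K_ndc` is an assumed batched 4x4 NDC intrinsic matrix):
+# rescale it to a (height, width) = (480, 640) screen, so focal lengths and
+# principal point come out in pixels.
+#
+#   K_screen = convert_ndc_to_screen(K_ndc, resolution=(480, 640),
+#                                    is_perspective=True)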
+
+
+def convert_screen_to_ndc(
+ K: Union[torch.Tensor, np.ndarray],
+ resolution: Union[int, Tuple[int, int], torch.Tensor, np.ndarray],
+ sign: Optional[Iterable[int]] = None,
+ is_perspective: bool = True) -> Union[torch.Tensor, np.ndarray]:
+ """Convert intrinsic matrix from screen to ndc.
+
+ Args:
+ K (Union[torch.Tensor, np.ndarray]): input intrinsic matrix.
+ resolution (Union[int, Tuple[int, int], torch.Tensor, np.ndarray]):
+ (height, width) of image.
+ sign (Optional[Iterable[int]], optional): xyz axis sign.
+ Defaults to None.
+ is_perspective (bool, optional): whether is perspective projection.
+ Defaults to True.
+
+ Raises:
+ TypeError: K should be Tensor or array.
+ ValueError: shape of K should be (batch, 4, 4)
+
+ Returns:
+ Union[torch.Tensor, np.ndarray]: output intrinsic matrix.
+ """
+ if sign is None:
+ sign = [1, 1, 1]
+
+ if isinstance(K, torch.Tensor):
+ K = K.clone()
+ elif isinstance(K, np.ndarray):
+ K = K.copy()
+ else:
+ raise TypeError(
+ f'K should be `torch.Tensor` or `np.ndarray`, type(K): {type(K)}')
+ if K.ndim == 2:
+ K = K[None].reshape(-1, 4, 4)
+ elif K.ndim == 3:
+ K = K.reshape(-1, 4, 4)
+ else:
+ raise ValueError(f'Wrong ndim of K: {K.ndim}')
+
+ if isinstance(resolution, (int, float)):
+ w_src = h_src = resolution
+ elif isinstance(resolution, (list, tuple)):
+ h_src, w_src = resolution
+ elif isinstance(resolution, (torch.Tensor, np.ndarray)):
+ resolution = resolution.reshape(-1, 2)
+ h_src, w_src = resolution[:, 0], resolution[:, 1]
+
+ aspect_ratio = w_src / h_src
+ K[:, 0, 0] /= w_src / 2
+ K[:, 1, 1] /= h_src / 2
+ if aspect_ratio > 1:
+ K[:, 0, 0] *= aspect_ratio
+ else:
+ K[:, 1, 1] /= aspect_ratio
+ if is_perspective:
+ K[:, 0, 2] = K[:, 0, 2] / (w_src / 2) - 1
+ K[:, 1, 2] = K[:, 1, 2] / (h_src / 2) - 1
+ K[:, 0, 2] *= sign[0]
+ K[:, 1, 2] *= sign[1]
+ else:
+ K[:, 0, 3] = K[:, 0, 3] / (w_src / 2) - 1
+ K[:, 1, 3] = K[:, 1, 3] / (h_src / 2) - 1
+ K[:, 0, 3] *= sign[0]
+ K[:, 1, 3] *= sign[1]
+ return K
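+
+
+# A minimal sketch (`K_screen` is an assumed batched 4x4 pixel-space intrinsic
+# matrix): with the default sign and the principal point at the image centre
+# (cx, cy) = (320, 240) of a (480, 640) image, the NDC principal point becomes
+# (0, 0).
+#
+#   K_ndc = convert_screen_to_ndc(K_screen, resolution=(480, 640),
+#                                 is_perspective=True)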
+
+
+def convert_world_view(
+ R: Union[torch.Tensor, np.ndarray], T: Union[torch.Tensor, np.ndarray]
+) -> Tuple[Union[torch.Tensor, np.ndarray], Union[torch.Tensor, np.ndarray]]:
+ """Convert between view_to_world and world_to_view defined extrinsic
+ matrix.
+
+ Args:
+ R (Union[torch.Tensor, np.ndarray]): extrinsic rotation matrix.
+ shape should be (batch, 3, 3).
+ T (Union[torch.Tensor, np.ndarray]): extrinsic translation vector,
+ shape should be (batch, 3).
+
+ Raises:
+ TypeError: R and T should be of the same type.
+
+ Returns:
+ Tuple[Union[torch.Tensor, np.ndarray], Union[torch.Tensor,
+ np.ndarray]]: output R, T.
+ """
+ if not (type(R) is type(T)):
+ raise TypeError(
+ f'R: {type(R)}, T: {type(T)} should have the same type.')
+ if isinstance(R, torch.Tensor):
+ R = R.clone()
+ T = T.clone()
+ R = R.permute(0, 2, 1)
+ T = -(R @ T.view(-1, 3, 1)).view(-1, 3)
+ elif isinstance(R, np.ndarray):
+ R = R.copy()
+ T = T.copy()
+ R = R.transpose(0, 2, 1)
+ T = -(R @ T.reshape(-1, 3, 1)).reshape(-1, 3)
+ else:
+ raise TypeError(f'R: {type(R)}, T: {type(T)} should be torch.Tensor '
+ f'or numpy.ndarray.')
+ return R, T
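+
+
+# Sketch (R, T assumed to be batched (batch, 3, 3) / (batch, 3) extrinsics):
+# for an orthonormal rotation the conversion is its own inverse, so applying
+# it twice recovers the original R and T.
+#
+#   R2, T2 = convert_world_view(*convert_world_view(R, T))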
diff --git a/detrsmpl/core/conventions/cameras/convert_projection.py b/detrsmpl/core/conventions/cameras/convert_projection.py
new file mode 100644
index 0000000000000000000000000000000000000000..e051d2f195a018ed5948c21fa790a91fafe693c1
--- /dev/null
+++ b/detrsmpl/core/conventions/cameras/convert_projection.py
@@ -0,0 +1,108 @@
+from typing import Tuple, Union
+
+import numpy as np
+import torch
+
+from .convert_convention import convert_camera_matrix
+
+
+def convert_perspective_to_weakperspective(
+ K: Union[torch.Tensor, np.ndarray],
+ zmean: Union[torch.Tensor, np.ndarray, float, int],
+ resolution: Union[int, Tuple[int, int], torch.Tensor,
+ np.ndarray] = None,
+ in_ndc: bool = False,
+ convention: str = 'opencv') -> Union[torch.Tensor, np.ndarray]:
+ """Convert perspective to weakperspective intrinsic matrix.
+
+ Args:
+ K (Union[torch.Tensor, np.ndarray]): input intrinsic matrix, shape
+ should be (batch, 4, 4) or (batch, 3, 3).
+ zmean (Union[torch.Tensor, np.ndarray, int, float]): zmean for object.
+ shape should be (batch, ) or singleton number.
+ resolution (Union[int, Tuple[int, int], torch.Tensor, np.ndarray],
+ optional): (height, width) of image. Defaults to None.
+ in_ndc (bool, optional): whether defined in ndc. Defaults to False.
+ convention (str, optional): camera convention. Defaults to 'opencv'.
+
+ Returns:
+ Union[torch.Tensor, np.ndarray]: output weak-perspective intrinsic
+ matrix, shape is (batch, 4, 4).
+ """
+ assert K is not None, 'K is required.'
+ K, _, _ = convert_camera_matrix(K=K,
+ convention_src=convention,
+ convention_dst='pytorch3d',
+ is_perspective=True,
+ in_ndc_src=in_ndc,
+ in_ndc_dst=True,
+ resolution_src=resolution)
+ if isinstance(zmean, np.ndarray):
+ zmean = torch.Tensor(zmean)
+ elif isinstance(zmean, (float, int)):
+ zmean = torch.Tensor([zmean])
+ zmean = zmean.view(-1)
+ num_frame = max(zmean.shape[0], K.shape[0])
+ new_K = torch.eye(4, 4)[None].repeat(num_frame, 1, 1)
+ fx = K[:, 0, 0]
+ fy = K[:, 1, 1]
+ cx = K[:, 0, 2]
+ cy = K[:, 1, 2]
+ new_K[:, 0, 0] = fx / zmean
+ new_K[:, 1, 1] = fy / zmean
+ new_K[:, 0, 3] = cx
+ new_K[:, 1, 3] = cy
+ return new_K
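+
+
+# A minimal usage sketch (illustrative values; `K_opencv` is an assumed
+# screen-space OpenCV intrinsic matrix): approximate the camera by a
+# weak-perspective one at mean depth zmean; the result is a (batch, 4, 4)
+# matrix in the PyTorch3D NDC convention.
+#
+#   wp_K = convert_perspective_to_weakperspective(
+#       K=K_opencv, zmean=5.0, resolution=(480, 640),
+#       in_ndc=False, convention='opencv')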
+
+
+def convert_weakperspective_to_perspective(
+ K: Union[torch.Tensor, np.ndarray],
+ zmean: Union[torch.Tensor, np.ndarray, int, float],
+ resolution: Union[int, Tuple[int, int], torch.Tensor,
+ np.ndarray] = None,
+ in_ndc: bool = False,
+ convention: str = 'opencv') -> Union[torch.Tensor, np.ndarray]:
+ """Convert perspective to weakperspective intrinsic matrix.
+
+ Args:
+ K (Union[torch.Tensor, np.ndarray]): input weak-perspective intrinsic
+ matrix, shape should be (batch, 4, 4).
+ zmean (Union[torch.Tensor, np.ndarray, int, float]): zmean for object.
+ shape should be (batch, ) or singleton number.
+ resolution (Union[int, Tuple[int, int], torch.Tensor, np.ndarray],
+ optional): (height, width) of image. Defaults to None.
+ in_ndc (bool, optional): whether defined in ndc. Defaults to False.
+ convention (str, optional): camera convention. Defaults to 'opencv'.
+
+ Returns:
+ Union[torch.Tensor, np.ndarray]: output perspective intrinsic
+ matrix, shape is (batch, 4, 4).
+ """
+ if K.ndim == 2:
+ K = K[None]
+ if isinstance(zmean, np.ndarray):
+ zmean = torch.Tensor(zmean)
+ elif isinstance(zmean, (float, int)):
+ zmean = torch.Tensor([zmean])
+ zmean = zmean.view(-1)
+ _N = max(K.shape[0], zmean.shape[0])
+ s1 = K[:, 0, 0]
+ s2 = K[:, 1, 1]
+ c1 = K[:, 0, 3]
+ c2 = K[:, 1, 3]
+ new_K = torch.zeros(_N, 4, 4)
+ new_K[:, 0, 0] = zmean * s1
+ new_K[:, 1, 1] = zmean * s2
+ new_K[:, 0, 2] = c1
+ new_K[:, 1, 2] = c2
+ new_K[:, 2, 3] = 1
+ new_K[:, 3, 2] = 1
+
+ new_K, _, _ = convert_camera_matrix(K=new_K,
+ convention_src=convention,
+ convention_dst='pytorch3d',
+ is_perspective=True,
+ in_ndc_src=in_ndc,
+ in_ndc_dst=True,
+ resolution_src=resolution)
+ return new_K
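+
+
+# A minimal usage sketch (`wp_K` is an assumed weak-perspective (batch, 4, 4)
+# matrix such as the one produced above): recover a perspective intrinsic
+# matrix at mean depth zmean.
+#
+#   K = convert_weakperspective_to_perspective(
+#       K=wp_K, zmean=5.0, in_ndc=True, convention='pytorch3d')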
diff --git a/detrsmpl/core/conventions/joints_mapping/__init__.py b/detrsmpl/core/conventions/joints_mapping/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/core/conventions/joints_mapping/standard_joint_angles.py b/detrsmpl/core/conventions/joints_mapping/standard_joint_angles.py
new file mode 100644
index 0000000000000000000000000000000000000000..44ca32a659224afb8465ffa09d625e70a087b409
--- /dev/null
+++ b/detrsmpl/core/conventions/joints_mapping/standard_joint_angles.py
@@ -0,0 +1,54 @@
+import torch
+
+TRANSFORMATION_AA_TO_SJA = torch.Tensor([
+ [[1, 0, 0], [0, 0, 1], [0, -1, 0]], # 00, 'left_hip',
+ [[1, 0, 0], [0, 0, 1], [0, -1, 0]], # 01, 'right_hip',
+ [[1, 0, 0], [0, 0, -1], [0, 1, 0]], # 02, 'spine1',
+ [[1, 0, 0], [0, 0, 1], [0, -1, 0]], # 03, 'left_knee',
+ [[1, 0, 0], [0, 0, 1], [0, -1, 0]], # 04, 'right_knee',
+ [[1, 0, 0], [0, 0, -1], [0, 1, 0]], # 05, 'spine2',
+ [[1, 0, 0], [0, 1, 0], [0, 0, 1]], # 06, 'left_ankle',
+ [[1, 0, 0], [0, 1, 0], [0, 0, 1]], # 07, 'right_ankle',
+ [[1, 0, 0], [0, 0, -1], [0, 1, 0]], # 08, 'spine3',
+ [[1, 0, 0], [0, 1, 0], [0, 0, 1]], # 09, 'left_foot',
+ [[1, 0, 0], [0, 1, 0], [0, 0, 1]], # 10, 'right_foot',
+ [[1, 0, 0], [0, 0, -1], [0, 1, 0]], # 11, 'neck',
+ [[0, 0, -1], [0, 1, 0], [1, 0, 0]], # 12, 'left_collar',
+ [[0, 0, 1], [0, 1, 0], [-1, 0, 0]], # 13, 'right_collar',
+ [[1, 0, 0], [0, 0, -1], [0, 1, 0]], # 14, 'head',
+ [[0, 0, -1], [0, 1, 0], [1, 0, 0]], # 15, 'left_shoulder',
+ [[0, 0, 1], [0, 1, 0], [-1, 0, 0]], # 16, 'right_shoulder',
+ [[0, 0, -1], [0, 1, 0], [1, 0, 0]], # 17, 'left_elbow',
+ [[0, 0, 1], [0, 1, 0], [-1, 0, 0]], # 18, 'right_elbow',
+ [[0, 0, -1], [0, 1, 0], [1, 0, 0]], # 19, 'left_wrist',
+ [[0, 0, 1], [0, 1, 0], [-1, 0, 0]], # 20, 'right_wrist',
+])
+
+TRANSFORMATION_SJA_TO_AA = \
+ torch.inverse(TRANSFORMATION_AA_TO_SJA)
+
+# TODO: spines and shoulders may need further adjustment
+STANDARD_JOINT_ANGLE_LIMITS = torch.deg2rad(
+ torch.Tensor([
+ [[-45, 155], [-88, 17], [-105, 85]], # 00, 'left_hip',
+ [[-45, 155], [-17, 88], [-85, 105]], # 01, 'right_hip',
+ [[-25, 15], [-20, 20], [-30, 30]], # 02, 'spine1',
+ [[0, 150], [0, 0], [0, 0]], # 03, 'left_knee',
+ [[0, 150], [0, 0], [0, 0]], # 04, 'right_knee',
+ [[-25, 15], [-15, 15], [-25, 25]], # 05, 'spine2',
+ [[-31, 63], [-26, 26], [-74, 15]], # 06, 'left_ankle',
+ [[-31, 63], [-26, 26], [-15, 74]], # 07, 'right_ankle',
+ [[-25, 15], [-15, 15], [-25, 25]], # 08, 'spine3',
+ [[-60, 45], [0, 0], [-45, 45]], # 09, 'left_foot',
+ [[-60, 45], [0, 0], [-45, 45]], # 10, 'right_foot',
+ [[-37, 22], [-30, 30], [-45, 45]], # 11, 'neck',
+ [[-30, 30], [-30, 10], [0, 0]], # 12, 'left_collar',
+ [[-30, 30], [-10, 30], [0, 0]], # 13, 'right_collar',
+ [[-37, 22], [-30, 30], [-45, 45]], # 14, 'head',
+ [[-90, 135], [-97, 91], [-90, 135]], # 15, 'left_shoulder',
+ [[-135, 90], [-91, 97], [-135, 90]], # 16, 'right_shoulder',
+ [[0, 0], [-150, 0], [0, 0]], # 17, 'left_elbow',
+ [[0, 0], [0, 150], [0, 0]], # 18, 'right_elbow',
+ [[-90, 90], [-45, 45], [-180, 60]], # 19, 'left_wrist',
+ [[-90, 90], [-45, 45], [-60, 180]], # 20, 'right_wrist',
+ ]))
diff --git a/detrsmpl/core/conventions/keypoints_mapping/__init__.py b/detrsmpl/core/conventions/keypoints_mapping/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe966a6158eca3ecabe31b4bc6e44c4622983668
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/__init__.py
@@ -0,0 +1,399 @@
+from collections import defaultdict
+from typing import List, Tuple, Union
+
+import numpy as np
+import torch
+from mmcv.utils import print_log
+
+from detrsmpl.core.conventions.keypoints_mapping import (
+ agora,
+ coco,
+ coco_wholebody,
+ crowdpose,
+ face3d,
+ flame,
+ gta,
+ h36m,
+ human_data,
+ hybrik,
+ instavariety,
+ lsp,
+ mano,
+ mpi_inf_3dhp,
+ mpii,
+ openpose,
+ penn_action,
+ posetrack,
+ pw3d,
+ smpl,
+ smplx,
+ spin_smplx,
+ star,
+)
+
+KEYPOINTS_FACTORY = {
+ 'human_data': human_data.HUMAN_DATA,
+ 'agora': agora.AGORA_KEYPOINTS,
+ 'coco': coco.COCO_KEYPOINTS,
+ 'coco_wholebody': coco_wholebody.COCO_WHOLEBODY_KEYPOINTS,
+ 'crowdpose': crowdpose.CROWDPOSE_KEYPOINTS,
+ 'smplx': smplx.SMPLX_KEYPOINTS,
+ 'smplx_137': smplx.SMPLX_137_KEYPOINTS,
+ 'smplx_lhand': smplx.SMPLX_LHAND,
+ 'smplx_rhand': smplx.SMPLX_RHAND,
+ 'smplx_face': smplx.SMPLX_FACE,
+ 'smplx_aios': smplx.AiOS_35_KEYPOINTS,
+ 'smpl': smpl.SMPL_KEYPOINTS,
+ 'smpl_45': smpl.SMPL_45_KEYPOINTS,
+ 'smpl_54': smpl.SMPL_54_KEYPOINTS,
+ 'smpl_49': smpl.SMPL_49_KEYPOINTS,
+ 'smpl_24': smpl.SMPL_24_KEYPOINTS,
+ 'star': star.STAR_KEYPOINTS,
+ 'mpi_inf_3dhp': mpi_inf_3dhp.MPI_INF_3DHP_KEYPOINTS,
+ 'mpi_inf_3dhp_test': mpi_inf_3dhp.MPI_INF_3DHP_TEST_KEYPOINTS,
+ 'penn_action': penn_action.PENN_ACTION_KEYPOINTS,
+ 'h36m': h36m.H36M_KEYPOINTS,
+ 'h36m_mmpose': h36m.H36M_KEYPOINTS_MMPOSE,
+ 'h36m_smplx': h36m.H36M_KEYPOINTS_SMPLX,
+ 'pw3d': pw3d.PW3D_KEYPOINTS,
+ 'mpii': mpii.MPII_KEYPOINTS,
+ 'lsp': lsp.LSP_KEYPOINTS,
+ 'posetrack': posetrack.POSETRACK_KEYPOINTS,
+ 'instavariety': instavariety.INSTAVARIETY_KEYPOINTS,
+ 'openpose_25': openpose.OPENPOSE_25_KEYPOINTS,
+ 'openpose_118': openpose.OPENPOSE_118_KEYPOINTS,
+ 'openpose_135': openpose.OPENPOSE_135_KEYPOINTS,
+ 'openpose_137': openpose.OPENPOSE_137_KEYPOINTS,
+ 'hybrik_29': hybrik.HYBRIK_29_KEYPOINTS,
+ 'hybrik_hp3d': mpi_inf_3dhp.HYBRIK_MPI_INF_3DHP_KEYPOINTS,
+ 'gta': gta.GTA_KEYPOINTS,
+ 'flame': flame.FLAME_73_KEYPOINTS,
+ 'face3d': face3d.FACE3D_IND,
+ 'spin_smplx': spin_smplx.SPIN_SMPLX_KEYPOINTS,
+ 'mano': mano.MANO_KEYPOINTS,
+ 'mano_left': mano.MANO_LEFT_KEYPOINTS,
+ 'mano_right': mano.MANO_RIGHT_KEYPOINTS,
+ 'mano_hands': mano.MANO_HANDS_KEYPOINTS,
+ 'mano_left_reorder': mano.MANO_LEFT_REORDER_KEYPOINTS,
+ 'mano_right_reorder': mano.MANO_RIGHT_REORDER_KEYPOINTS,
+ 'mano_hands_reorder': mano.MANO_HANDS_REORDER_KEYPOINTS,
+}
+
+__KEYPOINTS_MAPPING_CACHE__ = defaultdict(dict)
+
+
+def convert_kps(
+ keypoints: Union[np.ndarray, torch.Tensor],
+ src: str,
+ dst: str,
+ approximate: bool = False,
+ mask: Union[np.ndarray, torch.Tensor] = None,
+ keypoints_factory: dict = KEYPOINTS_FACTORY,
+ return_mask: bool = True
+) -> Tuple[Union[np.ndarray, torch.Tensor], Union[np.ndarray, torch.Tensor]]:
+ """Convert keypoints following the mapping correspondence between src and
+ dst keypoints definition. Supported conventions by now: agora, coco, smplx,
+ smpl, mpi_inf_3dhp, mpi_inf_3dhp_test, h36m, h36m_mmpose, pw3d, mpii, lsp.
+ Args:
+ keypoints [Union[np.ndarray, torch.Tensor]]: input keypoints array,
+ could be (f * n * J * 3/2) or (f * J * 3/2).
+ You can set keypoints as np.zeros((1, J, 2))
+ if you only need mask.
+ src (str): source data type from keypoints_factory.
+ dst (str): destination data type from keypoints_factory.
+ approximate (bool): control whether approximate mapping is allowed.
+ mask (Union[np.ndarray, torch.Tensor], optional):
+ The original mask to mark the existence of the keypoints.
+ None represents all ones mask.
+ Defaults to None.
+ keypoints_factory (dict, optional): A class to store the attributes.
+ Defaults to keypoints_factory.
+ return_mask (bool, optional): whether to return a mask as part of the
+ output. Returning a mask is unnecessary if the keypoints already
+ carry confidence values, since invalid keypoints get zero confidence.
+ Defaults to True.
+ Returns:
+ Tuple[Union[np.ndarray, torch.Tensor], Union[np.ndarray, torch.Tensor]]
+ : tuple of (out_keypoints, mask). out_keypoints and mask will be of
+ the same type.
+ """
+ assert keypoints.ndim in {3, 4}
+ if isinstance(keypoints, torch.Tensor):
+
+ def new_array_func(shape, value, device_data, if_uint8):
+ if if_uint8:
+ dtype = torch.uint8
+ else:
+ dtype = None
+ if value == 1:
+ return torch.ones(size=shape,
+ dtype=dtype,
+ device=device_data.device)
+ elif value == 0:
+ return torch.zeros(size=shape,
+ dtype=dtype,
+ device=device_data.device)
+ else:
+ raise ValueError
+
+ def to_type_uint8_func(data):
+ return data.to(dtype=torch.uint8)
+
+ elif isinstance(keypoints, np.ndarray):
+
+ def new_array_func(shape, value, device_data, if_uint8):
+ if if_uint8:
+ dtype = np.uint8
+ else:
+ dtype = None
+ if value == 1:
+ return np.ones(shape=shape, dtype=dtype)
+ elif value == 0:
+ return np.zeros(shape=shape, dtype=dtype)
+ else:
+ raise ValueError
+
+ def to_type_uint8_func(data):
+ return data.astype(np.uint8)
+
+ else:
+ raise TypeError('Type of keypoints is neither' +
+ ' torch.Tensor nor np.ndarray.\n' +
+ f'Type of keypoints: {type(keypoints)}')
+
+ if mask is not None:
+ assert type(mask) == type(keypoints)
+ else:
+ mask = new_array_func(shape=(keypoints.shape[-2], ),
+ value=1,
+ device_data=keypoints,
+ if_uint8=True)
+
+ if src == dst:
+ if return_mask:
+ return keypoints, mask
+ else:
+ return keypoints
+
+ src_names = keypoints_factory[src.lower()]
+ dst_names = keypoints_factory[dst.lower()]
+ extra_dims = keypoints.shape[:-2]
+ keypoints = keypoints.reshape(-1, len(src_names), keypoints.shape[-1])
+
+ out_keypoints = new_array_func(shape=(keypoints.shape[0], len(dst_names),
+ keypoints.shape[-1]),
+ value=0,
+ device_data=keypoints,
+ if_uint8=False)
+
+ original_mask = mask
+ if original_mask is not None:
+ original_mask = original_mask.reshape(-1)
+ assert original_mask.shape[0] == len(
+ src_names), f'The length of mask should be {len(src_names)}'
+
+ mask = new_array_func(shape=(len(dst_names), ),
+ value=0,
+ device_data=keypoints,
+ if_uint8=True)
+
+ dst_idxs, src_idxs, _ = \
+ get_mapping(src, dst, approximate, keypoints_factory)
+ out_keypoints[:, dst_idxs] = keypoints[:, src_idxs]
+ out_shape = extra_dims + (len(dst_names), keypoints.shape[-1])
+ out_keypoints = out_keypoints.reshape(out_shape)
+ mask[dst_idxs] = to_type_uint8_func(original_mask[src_idxs]) \
+ if original_mask is not None else 1.0
+
+ if return_mask:
+ return out_keypoints, mask
+ else:
+ return out_keypoints
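+
+
+# A minimal usage sketch (dummy zero keypoints, purely illustrative): map COCO
+# keypoints into the 'human_data' convention; `mask` marks which target joints
+# were actually filled from the source.
+#
+#   kps = np.zeros((1, len(KEYPOINTS_FACTORY['coco']), 3))
+#   out_kps, mask = convert_kps(kps, src='coco', dst='human_data')
+#   # out_kps.shape == (1, len(KEYPOINTS_FACTORY['human_data']), 3)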
+
+
+def compress_converted_kps(
+ zero_pad_array: Union[np.ndarray, torch.Tensor],
+ mask_array: Union[np.ndarray, torch.Tensor],
+) -> Union[np.ndarray, torch.Tensor]:
+ """Compress keypoints that are zero-padded after applying convert_kps.
+
+ Args:
+ zero_pad_array (Union[np.ndarray, torch.Tensor]): zero-padded
+ keypoints array produced by convert_kps, could be
+ (f * n * J * 3/2) or (f * J * 3/2).
+ mask_array (Union[np.ndarray, torch.Tensor]):
+ The mask marking the existence of the keypoints.
+ Returns:
+ Union[np.ndarray, torch.Tensor]: out_keypoints
+ """
+
+ assert mask_array.shape[0] == zero_pad_array.shape[1]
+ valid_mask_index = np.where(mask_array == 1)[0]
+ compressed_array = np.take(zero_pad_array, valid_mask_index, axis=1)
+ return compressed_array
+
+
+def get_mapping(src: str,
+ dst: str,
+ approximate: bool = False,
+ keypoints_factory: dict = KEYPOINTS_FACTORY):
+ """Get mapping list from src to dst.
+
+ Args:
+ src (str): source data type from keypoints_factory.
+ dst (str): destination data type from keypoints_factory.
+ approximate (bool): control whether approximate mapping is allowed.
+ keypoints_factory (dict, optional): A class to store the attributes.
+ Defaults to keypoints_factory.
+
+ Returns:
+ list:
+ [dst_to_intersection_idx, src_to_intersection_idx,
+ intersection_names]
+ """
+ if src in __KEYPOINTS_MAPPING_CACHE__ and \
+ dst in __KEYPOINTS_MAPPING_CACHE__[src] and \
+ __KEYPOINTS_MAPPING_CACHE__[src][dst][3] == approximate:
+ return __KEYPOINTS_MAPPING_CACHE__[src][dst][:3]
+ else:
+ src_names = keypoints_factory[src.lower()]
+ dst_names = keypoints_factory[dst.lower()]
+
+ dst_idxs, src_idxs, intersection = [], [], []
+ unmapped_names, approximate_names = [], []
+ for dst_idx, dst_name in enumerate(dst_names):
+ matched = False
+ try:
+ src_idx = src_names.index(dst_name)
+ except ValueError:
+ src_idx = -1
+ if src_idx >= 0:
+ matched = True
+ dst_idxs.append(dst_idx)
+ src_idxs.append(src_idx)
+ intersection.append(dst_name)
+ # approximate mapping
+ if approximate and not matched:
+
+ try:
+ part_list = human_data.APPROXIMATE_MAP[dst_name]
+ except KeyError:
+ continue
+ for approximate_name in part_list:
+ try:
+ src_idx = src_names.index(approximate_name)
+ except ValueError:
+ src_idx = -1
+ if src_idx >= 0:
+ dst_idxs.append(dst_idx)
+ src_idxs.append(src_idx)
+ intersection.append(dst_name)
+ unmapped_names.append(src_names[src_idx])
+ approximate_names.append(dst_name)
+ break
+
+ if unmapped_names:
+ warn_message = \
+ f'Approximate mapping {unmapped_names}' +\
+ f' to {approximate_names}'
+ print_log(msg=warn_message)
+
+ mapping_list = [dst_idxs, src_idxs, intersection, approximate]
+
+ if src not in __KEYPOINTS_MAPPING_CACHE__:
+ __KEYPOINTS_MAPPING_CACHE__[src] = {}
+ __KEYPOINTS_MAPPING_CACHE__[src][dst] = mapping_list
+ return mapping_list[:3]
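+
+
+# Sketch: the two index lists are aligned element-wise, which is exactly how
+# convert_kps uses them: dst_kps[:, dst_idxs] = src_kps[:, src_idxs].
+#
+#   dst_idxs, src_idxs, names = get_mapping('coco', 'human_data')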
+
+
+def get_flip_pairs(convention: str = 'smplx',
+ keypoints_factory: dict = KEYPOINTS_FACTORY) -> List[List[int]]:
+ """Get indices of left, right keypoint pairs from specified convention.
+
+ Args:
+ convention (str): data type from keypoints_factory.
+ keypoints_factory (dict, optional): A class to store the attributes.
+ Defaults to keypoints_factory.
+ Returns:
+ List[List[int]]: left, right keypoint index pairs
+ """
+ flip_pairs = []
+ keypoints = keypoints_factory[convention]
+ left_kps = [kp for kp in keypoints if 'left_' in kp]
+ for left_kp in left_kps:
+ right_kp = left_kp.replace('left_', 'right_')
+ flip_pairs.append([keypoints.index(kp) for kp in [left_kp, right_kp]])
+ return flip_pairs
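+
+
+# Sketch: each entry is a [left_index, right_index] pair; for the 'coco'
+# convention the first pair is [1, 2] (left_eye, right_eye).
+#
+#   flip_pairs = get_flip_pairs('coco')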
+
+
+def get_keypoint_idxs_by_part(
+ part: str,
+ convention: str = 'smplx',
+ keypoints_factory: dict = KEYPOINTS_FACTORY) -> List[int]:
+ """Get part keypoints indices from specified part and convention.
+
+ Args:
+ part (str): part to search from
+ convention (str): data type from keypoints_factory.
+ keypoints_factory (dict, optional): A class to store the attributes.
+ Defaults to keypoints_factory.
+ Returns:
+ List[int]: part keypoint indices
+ """
+ humandata_parts = human_data.HUMAN_DATA_PARTS
+ keypoints = keypoints_factory[convention]
+ if part not in humandata_parts.keys():
+ raise ValueError(f'{part} is not in allowed parts: '
+ f'{list(humandata_parts.keys())}')
+ part_keypoints = list(set(humandata_parts[part]) & set(keypoints))
+ part_keypoints_idx = [keypoints.index(kp) for kp in part_keypoints]
+ return part_keypoints_idx
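+
+
+# Sketch: collect the indices of all left-hand joints defined in the 'smplx'
+# convention; note the result follows set-intersection order, not the
+# convention's own ordering.
+#
+#   lhand_idxs = get_keypoint_idxs_by_part('left_hand', 'smplx')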
+
+
+def get_keypoint_idx(name: str,
+ convention: str = 'smplx',
+ approximate: bool = False,
+ keypoints_factory: dict = KEYPOINTS_FACTORY) -> int:
+ """Get keypoint index from specified convention with keypoint name.
+
+ Args:
+ name (str): keypoint name
+ convention (str): data type from keypoints_factory.
+ approximate (bool): control whether approximate mapping is allowed.
+ keypoints_factory (dict, optional): A class to store the attributes.
+ Defaults to keypoints_factory.
+ Returns:
+ int: keypoint index, -1 if the name cannot be found
+ """
+ keypoints = keypoints_factory[convention]
+ try:
+ idx = keypoints.index(name)
+ except ValueError:
+ idx = -1 # not matched
+ if approximate and idx == -1:
+ try:
+ part_list = human_data.APPROXIMATE_MAP[name]
+ except KeyError:
+ return idx
+ for approximate_name in part_list:
+ try:
+ idx = keypoints.index(approximate_name)
+ except ValueError:
+ idx = -1
+ if idx >= 0:
+ return idx
+ return idx
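+
+
+# Sketch: look up a keypoint index by name; -1 means the name (and, when
+# approximate=True, every approximate substitute) is absent from the
+# convention.
+#
+#   nose_idx = get_keypoint_idx('nose', 'coco')  # 0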
+
+
+def get_keypoint_num(convention: str = 'smplx',
+ keypoints_factory: dict = KEYPOINTS_FACTORY) -> int:
+ """Get number of keypoints of specified convention.
+
+ Args:
+ convention (str): data type from keypoints_factory.
+ keypoints_factory (dict, optional): A class to store the attributes.
+ Defaults to keypoints_factory.
+ Returns:
+ int: number of keypoints
+ """
+ keypoints = keypoints_factory[convention]
+ return len(keypoints)
diff --git a/detrsmpl/core/conventions/keypoints_mapping/agora.py b/detrsmpl/core/conventions/keypoints_mapping/agora.py
new file mode 100644
index 0000000000000000000000000000000000000000..0a1e08f739cc77d70b0f8a24bcaf265dcedd33ec
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/agora.py
@@ -0,0 +1,129 @@
+AGORA_KEYPOINTS = [
+ 'pelvis',
+ 'left_hip',
+ 'right_hip',
+ 'spine_1',
+ 'left_knee',
+ 'right_knee',
+ 'spine_2',
+ 'left_ankle',
+ 'right_ankle',
+ 'spine_3',
+ 'left_foot',
+ 'right_foot',
+ 'neck',
+ 'left_collar',
+ 'right_collar',
+ 'head',
+ 'left_shoulder',
+ 'right_shoulder',
+ 'left_elbow',
+ 'right_elbow',
+ 'left_wrist',
+ 'right_wrist',
+ 'jaw',
+ 'left_eyeball',
+ 'right_eyeball',
+ 'left_index_1',
+ 'left_index_2',
+ 'left_index_3',
+ 'left_middle_1',
+ 'left_middle_2',
+ 'left_middle_3',
+ 'left_pinky_1',
+ 'left_pinky_2',
+ 'left_pinky_3',
+ 'left_ring_1',
+ 'left_ring_2',
+ 'left_ring_3',
+ 'left_thumb_1',
+ 'left_thumb_2',
+ 'left_thumb_3',
+ 'right_index_1',
+ 'right_index_2',
+ 'right_index_3',
+ 'right_middle_1',
+ 'right_middle_2',
+ 'right_middle_3',
+ 'right_pinky_1',
+ 'right_pinky_2',
+ 'right_pinky_3',
+ 'right_ring_1',
+ 'right_ring_2',
+ 'right_ring_3',
+ 'right_thumb_1',
+ 'right_thumb_2',
+ 'right_thumb_3',
+ 'nose',
+ 'right_eye',
+ 'left_eye',
+ 'right_ear',
+ 'left_ear',
+ 'left_bigtoe',
+ 'left_smalltoe',
+ 'left_heel',
+ 'right_bigtoe',
+ 'right_smalltoe',
+ 'right_heel',
+ 'left_thumb',
+ 'left_index',
+ 'left_middle',
+ 'left_ring',
+ 'left_pinky',
+ 'right_thumb',
+ 'right_index',
+ 'right_middle',
+ 'right_ring',
+ 'right_pinky',
+ 'right_eyebrow_1',
+ 'right_eyebrow_2',
+ 'right_eyebrow_3',
+ 'right_eyebrow_4',
+ 'right_eyebrow_5',
+ 'left_eyebrow_5',
+ 'left_eyebrow_4',
+ 'left_eyebrow_3',
+ 'left_eyebrow_2',
+ 'left_eyebrow_1',
+ 'nosebridge_1',
+ 'nosebridge_2',
+ 'nosebridge_3',
+ 'nosebridge_4',
+ 'right_nose_2', # original name: nose_1
+ 'right_nose_1', # original name: nose_2
+ 'nose_middle', # original name: nose_3
+ 'left_nose_1', # original name: nose_4
+ 'left_nose_2', # original name: nose_5
+ 'right_eye_1',
+ 'right_eye_2',
+ 'right_eye_3',
+ 'right_eye_4',
+ 'right_eye_5',
+ 'right_eye_6',
+ 'left_eye_4',
+ 'left_eye_3',
+ 'left_eye_2',
+ 'left_eye_1',
+ 'left_eye_6',
+ 'left_eye_5',
+ 'right_mouth_1', # original name: mouth_1
+ 'right_mouth_2', # original name: mouth_2
+ 'right_mouth_3', # original name: mouth_3
+ 'mouth_top', # original name: mouth_4
+ 'left_mouth_3', # original name: mouth_5
+ 'left_mouth_2', # original name: mouth_6
+ 'left_mouth_1', # original name: mouth_7
+ 'left_mouth_5', # original name: mouth_8
+ 'left_mouth_4', # original name: mouth_9
+ 'mouth_bottom', # original name: mouth_10
+ 'right_mouth_4', # original name: mouth_11
+ 'right_mouth_5', # original name: mouth_12
+ 'right_lip_1', # original name: lip_1
+ 'right_lip_2', # original name: lip_2
+ 'lip_top', # original name: lip_3
+ 'left_lip_2', # original name: lip_4
+ 'left_lip_1', # original name: lip_5
+ 'left_lip_3', # original name: lip_6
+ 'lip_bottom', # original name: lip_7
+ 'right_lip_3', # original name: lip_8
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/coco.py b/detrsmpl/core/conventions/keypoints_mapping/coco.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c39e1ae5f14b767356751cd5195fbe224d0fd88
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/coco.py
@@ -0,0 +1,19 @@
+COCO_KEYPOINTS = [
+ 'nose',
+ 'left_eye',
+ 'right_eye',
+ 'left_ear',
+ 'right_ear',
+ 'left_shoulder',
+ 'right_shoulder',
+ 'left_elbow',
+ 'right_elbow',
+ 'left_wrist',
+ 'right_wrist',
+ 'left_hip_extra',
+ 'right_hip_extra',
+ 'left_knee',
+ 'right_knee',
+ 'left_ankle',
+ 'right_ankle',
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/coco_wholebody.py b/detrsmpl/core/conventions/keypoints_mapping/coco_wholebody.py
new file mode 100644
index 0000000000000000000000000000000000000000..39e1e7e1c5cace9092b591ea3d3f0e0e637c4b60
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/coco_wholebody.py
@@ -0,0 +1,135 @@
+COCO_WHOLEBODY_KEYPOINTS = [
+ 'nose',
+ 'left_eye',
+ 'right_eye',
+ 'left_ear',
+ 'right_ear',
+ 'left_shoulder',
+ 'right_shoulder',
+ 'left_elbow',
+ 'right_elbow',
+ 'left_wrist',
+ 'right_wrist',
+ 'left_hip',
+ 'right_hip',
+ 'left_knee',
+ 'right_knee',
+ 'left_ankle',
+ 'right_ankle',
+ 'left_bigtoe',
+ 'left_smalltoe',
+ 'left_heel',
+ 'right_bigtoe',
+ 'right_smalltoe',
+ 'right_heel',
+ 'right_contour_1', # original name: face_contour_1
+ 'right_contour_2', # original name: face_contour_2
+ 'right_contour_3', # original name: face_contour_3
+ 'right_contour_4', # original name: face_contour_4
+ 'right_contour_5', # original name: face_contour_5
+ 'right_contour_6', # original name: face_contour_6
+ 'right_contour_7', # original name: face_contour_7
+ 'right_contour_8', # original name: face_contour_8
+ 'contour_middle', # original name: face_contour_9
+ 'left_contour_8', # original name: face_contour_10
+ 'left_contour_7', # original name: face_contour_11
+ 'left_contour_6', # original name: face_contour_12
+ 'left_contour_5', # original name: face_contour_13
+ 'left_contour_4', # original name: face_contour_14
+ 'left_contour_3', # original name: face_contour_15
+ 'left_contour_2', # original name: face_contour_16
+ 'left_contour_1', # original name: face_contour_17
+ 'right_eyebrow_1',
+ 'right_eyebrow_2',
+ 'right_eyebrow_3',
+ 'right_eyebrow_4',
+ 'right_eyebrow_5',
+ 'left_eyebrow_5',
+ 'left_eyebrow_4',
+ 'left_eyebrow_3',
+ 'left_eyebrow_2',
+ 'left_eyebrow_1',
+ 'nosebridge_1',
+ 'nosebridge_2',
+ 'nosebridge_3',
+ 'nosebridge_4',
+ 'right_nose_2', # original name: nose_1
+ 'right_nose_1', # original name: nose_2
+ 'nose_middle', # original name: nose_3
+ 'left_nose_1', # original name: nose_4
+ 'left_nose_2', # original name: nose_5
+ 'right_eye_1',
+ 'right_eye_2',
+ 'right_eye_3',
+ 'right_eye_4',
+ 'right_eye_5',
+ 'right_eye_6',
+ 'left_eye_4',
+ 'left_eye_3',
+ 'left_eye_2',
+ 'left_eye_1',
+ 'left_eye_6',
+ 'left_eye_5',
+ 'right_mouth_1', # original name: mouth_1
+ 'right_mouth_2', # original name: mouth_2
+ 'right_mouth_3', # original name: mouth_3
+ 'mouth_top', # original name: mouth_4
+ 'left_mouth_3', # original name: mouth_5
+ 'left_mouth_2', # original name: mouth_6
+ 'left_mouth_1', # original name: mouth_7
+ 'left_mouth_5', # original name: mouth_8
+ 'left_mouth_4', # original name: mouth_9
+ 'mouth_bottom', # original name: mouth_10
+ 'right_mouth_4', # original name: mouth_11
+ 'right_mouth_5', # original name: mouth_12
+ 'right_lip_1', # original name: lip_1
+ 'right_lip_2', # original name: lip_2
+ 'lip_top', # original name: lip_3
+ 'left_lip_2', # original name: lip_4
+ 'left_lip_1', # original name: lip_5
+ 'left_lip_3', # original name: lip_6
+ 'lip_bottom', # original name: lip_7
+ 'right_lip_3', # original name: lip_8
+ 'left_hand_root',
+ 'left_thumb_1',
+ 'left_thumb_2',
+ 'left_thumb_3',
+ 'left_thumb',
+ 'left_index_1',
+ 'left_index_2',
+ 'left_index_3',
+ 'left_index',
+ 'left_middle_1',
+ 'left_middle_2',
+ 'left_middle_3',
+ 'left_middle',
+ 'left_ring_1',
+ 'left_ring_2',
+ 'left_ring_3',
+ 'left_ring',
+ 'left_pinky_1',
+ 'left_pinky_2',
+ 'left_pinky_3',
+ 'left_pinky',
+ 'right_hand_root',
+ 'right_thumb_1',
+ 'right_thumb_2',
+ 'right_thumb_3',
+ 'right_thumb',
+ 'right_index_1',
+ 'right_index_2',
+ 'right_index_3',
+ 'right_index',
+ 'right_middle_1',
+ 'right_middle_2',
+ 'right_middle_3',
+ 'right_middle',
+ 'right_ring_1',
+ 'right_ring_2',
+ 'right_ring_3',
+ 'right_ring',
+ 'right_pinky_1',
+ 'right_pinky_2',
+ 'right_pinky_3',
+ 'right_pinky',
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/crowdpose.py b/detrsmpl/core/conventions/keypoints_mapping/crowdpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f6b154c6b3d1a434225d19ac0755adb39dc52e2
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/crowdpose.py
@@ -0,0 +1,5 @@
+CROWDPOSE_KEYPOINTS = [
+ 'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow',
+ 'left_wrist', 'right_wrist', 'left_hip', 'right_hip', 'left_knee',
+ 'right_knee', 'left_ankle', 'right_ankle', 'head', 'neck'
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/face3d.py b/detrsmpl/core/conventions/keypoints_mapping/face3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb4c3312005f687c6f50f1175009e0d2d11118fb
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/face3d.py
@@ -0,0 +1,4 @@
+FACE3D_IND = [
+ 'right_eye_1', 'right_eye_4', 'left_eye_4', 'left_eye_1', 'nose_middle',
+ 'right_mouth_1', 'left_mouth_1'
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/flame.py b/detrsmpl/core/conventions/keypoints_mapping/flame.py
new file mode 100644
index 0000000000000000000000000000000000000000..15a6d8051ce369abecbb319f956fdbdb35fbc979
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/flame.py
@@ -0,0 +1,75 @@
+FLAME_73_KEYPOINTS = [
+ 'head',
+ 'neck',
+ 'jaw',
+ 'left_eye',
+ 'right_eye',
+ 'right_eyebrow_1',
+ 'right_eyebrow_2',
+ 'right_eyebrow_3',
+ 'right_eyebrow_4',
+ 'right_eyebrow_5',
+ 'left_eyebrow_5',
+ 'left_eyebrow_4',
+ 'left_eyebrow_3',
+ 'left_eyebrow_2',
+ 'left_eyebrow_1',
+ 'nosebridge_1',
+ 'nosebridge_2',
+ 'nosebridge_3',
+ 'nosebridge_4',
+ 'right_nose_2',
+ 'right_nose_1',
+ 'nose_middle',
+ 'left_nose_1',
+ 'left_nose_2',
+ 'right_eye_1',
+ 'right_eye_2',
+ 'right_eye_3',
+ 'right_eye_4',
+ 'right_eye_5',
+ 'right_eye_6',
+ 'left_eye_4',
+ 'left_eye_3',
+ 'left_eye_2',
+ 'left_eye_1',
+ 'left_eye_6',
+ 'left_eye_5',
+ 'right_mouth_1',
+ 'right_mouth_2',
+ 'right_mouth_3',
+ 'mouth_top',
+ 'left_mouth_3',
+ 'left_mouth_2',
+ 'left_mouth_1',
+ 'left_mouth_5',
+ 'left_mouth_4',
+ 'mouth_bottom',
+ 'right_mouth_4',
+ 'right_mouth_5',
+ 'right_lip_1',
+ 'right_lip_2',
+ 'lip_top',
+ 'left_lip_2',
+ 'left_lip_1',
+ 'left_lip_3',
+ 'lip_bottom',
+ 'right_lip_3',
+ 'right_contour_1',
+ 'right_contour_2',
+ 'right_contour_3',
+ 'right_contour_4',
+ 'right_contour_5',
+ 'right_contour_6',
+ 'right_contour_7',
+ 'right_contour_8',
+ 'contour_middle',
+ 'left_contour_8',
+ 'left_contour_7',
+ 'left_contour_6',
+ 'left_contour_5',
+ 'left_contour_4',
+ 'left_contour_3',
+ 'left_contour_2',
+ 'left_contour_1',
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/gta.py b/detrsmpl/core/conventions/keypoints_mapping/gta.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d9446ce6c7d56a382b76fb1a6d640df58c16937
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/gta.py
@@ -0,0 +1,205 @@
+# ORIGINAL_NAMES = [
+# 'head_top', # 00, extrapolate 02-01
+# 'head_center', # 01
+# 'neck', # 02
+# 'right_clavicle', # 03
+# 'right_shoulder', # 04
+# 'right_elbow', # 05
+# 'right_wrist', # 06
+# 'left_clavicle', # 07
+# 'left_shoulder', # 08
+# 'left_elbow', # 09
+# 'left_wrist', # 10
+# 'spine0', # 11
+# 'spine1', # 12
+# 'spine2', # 13
+# 'spine3', # 14
+# 'spine4', # 15
+# 'right_hip', # 16
+# 'right_knee', # 17
+# 'right_ankle', # 18
+# 'left_hip', # 19
+# 'left_knee', # 20
+# 'left_ankle', # 21
+# 'SKEL_ROOT', # 22
+# 'FB_R_Brow_Out_000', # 23
+# 'SKEL_L_Toe0', # 24
+# 'MH_R_Elbow', # 25
+# 'SKEL_L_Finger01', # 26
+# 'SKEL_L_Finger02', # 27
+# 'SKEL_L_Finger31', # 28
+# 'SKEL_L_Finger32', # 29
+# 'SKEL_L_Finger41', # 30
+# 'SKEL_L_Finger42', # 31
+# 'SKEL_L_Finger11', # 32
+# 'SKEL_L_Finger12', # 33
+# 'SKEL_L_Finger21', # 34
+# 'SKEL_L_Finger22', # 35
+# 'RB_L_ArmRoll', # 36
+# 'IK_R_Hand', # 37
+# 'RB_R_ThighRoll', # 38
+# 'FB_R_Lip_Corner_000', # 39
+# 'SKEL_Pelvis', # 40
+# 'IK_Head', # 41
+# 'MH_R_Knee', # 42
+# 'FB_LowerLipRoot_000', # 43
+# 'FB_R_Lip_Top_000', # 44
+# 'FB_R_CheekBone_000', # 45
+# 'FB_UpperLipRoot_000', # 46
+# 'FB_L_Lip_Top_000', # 47
+# 'FB_LowerLip_000', # 48
+# 'SKEL_R_Toe0', # 49
+# 'FB_L_CheekBone_000', # 50
+# 'MH_L_Elbow', # 51
+# 'RB_L_ThighRoll', # 52
+# 'PH_R_Foot', # 53
+# 'FB_L_Eye_000', # 54
+# 'SKEL_L_Finger00', # 55
+# 'SKEL_L_Finger10', # 56
+# 'SKEL_L_Finger20', # 57
+# 'SKEL_L_Finger30', # 58
+# 'SKEL_L_Finger40', # 59
+# 'FB_R_Eye_000', # 60
+# 'PH_R_Hand', # 61
+# 'FB_L_Lip_Corner_000', # 62
+# 'IK_R_Foot', # 63
+# 'RB_Neck_1', # 64
+# 'IK_L_Hand', # 65
+# 'RB_R_ArmRoll', # 66
+# 'FB_Brow_Centre_000', # 67
+# 'FB_R_Lid_Upper_000', # 68
+# 'RB_R_ForeArmRoll', # 69
+# 'FB_L_Lid_Upper_000', # 70
+# 'MH_L_Knee', # 71
+# 'FB_Jaw_000', # 72
+# 'FB_L_Lip_Bot_000', # 73
+# 'FB_Tongue_000', # 74
+# 'FB_R_Lip_Bot_000', # 75
+# 'IK_Root', # 76
+# 'PH_L_Foot', # 77
+# 'FB_L_Brow_Out_000', # 78
+# 'SKEL_R_Finger00', # 79
+# 'SKEL_R_Finger10', # 80
+# 'SKEL_R_Finger20', # 81
+# 'SKEL_R_Finger30', # 82
+# 'SKEL_R_Finger40', # 83
+# 'PH_L_Hand', # 84
+# 'RB_L_ForeArmRoll', # 85
+# 'FB_UpperLip_000', # 86
+# 'SKEL_R_Finger01', # 87
+# 'SKEL_R_Finger02', # 88
+# 'SKEL_R_Finger31', # 89
+# 'SKEL_R_Finger32', # 90
+# 'SKEL_R_Finger41', # 91
+# 'SKEL_R_Finger42', # 92
+# 'SKEL_R_Finger11', # 93
+# 'SKEL_R_Finger12', # 94
+# 'SKEL_R_Finger21', # 95
+# 'SKEL_R_Finger22', # 96
+# 'FACIAL_facialRoot', # 97
+# 'IK_L_Foot', # 98
+# 'interpolated_nose' # 99, mid-point of 45-50
+# ]
+
+GTA_KEYPOINTS = [
+ 'gta_head_top', # 00
+ 'head', # 01 - head_center
+ 'neck', # 02 - neck
+ 'gta_right_clavicle', # 03
+ 'right_shoulder', # 04 - right_shoulder
+ 'right_elbow', # 05 - right_elbow
+ 'right_wrist', # 06 - right_wrist
+ 'gta_left_clavicle', # 07
+ 'left_shoulder', # 08 - left_shoulder
+ 'left_elbow', # 09 - left_elbow
+ 'left_wrist', # 10 - left_wrist
+ 'spine_2', # 11 - spine0
+ 'gta_spine1', # 12
+ 'spine_1', # 13 - spine2
+ 'pelvis', # 14 - pelvis
+ 'gta_spine4', # 15
+ 'right_hip', # 16 - right_hip
+ 'right_knee', # 17 - right_knee
+ 'right_ankle', # 18 - right_ankle
+ 'left_hip', # 19 - left_hip
+ 'left_knee', # 20 - left_knee
+ 'left_ankle', # 21 - left_ankle
+ 'gta_SKEL_ROOT', # 22
+ 'gta_FB_R_Brow_Out_000', # 23
+ 'left_foot', # 24 - SKEL_L_Toe0
+ 'gta_MH_R_Elbow', # 25
+ 'left_thumb_2', # 26 - SKEL_L_Finger01
+ 'left_thumb_3', # 27 - SKEL_L_Finger02
+ 'left_ring_2', # 28 - SKEL_L_Finger31
+ 'left_ring_3', # 29 - SKEL_L_Finger32
+ 'left_pinky_2', # 30 - SKEL_L_Finger41
+ 'left_pinky_3', # 31 - SKEL_L_Finger42
+ 'left_index_2', # 32 - SKEL_L_Finger11
+ 'left_index_3', # 33 - SKEL_L_Finger12
+ 'left_middle_2', # 34 - SKEL_L_Finger21
+ 'left_middle_3', # 35 - SKEL_L_Finger22
+ 'gta_RB_L_ArmRoll', # 36
+ 'gta_IK_R_Hand', # 37
+ 'gta_RB_R_ThighRoll', # 38
+ 'gta_FB_R_Lip_Corner_000', # 39
+ 'gta_SKEL_Pelvis', # 40
+ 'gta_IK_Head', # 41
+ 'gta_MH_R_Knee', # 42
+ 'gta_FB_LowerLipRoot_000', # 43
+ 'gta_FB_R_Lip_Top_000', # 44
+ 'gta_FB_R_CheekBone_000', # 45
+ 'gta_FB_UpperLipRoot_000', # 46
+ 'gta_FB_L_Lip_Top_000', # 47
+ 'gta_FB_LowerLip_000', # 48
+ 'right_foot', # 49 - SKEL_R_Toe0
+ 'gta_FB_L_CheekBone_000', # 50
+ 'gta_MH_L_Elbow', # 51
+ 'gta_RB_L_ThighRoll', # 52
+ 'gta_PH_R_Foot', # 53
+ 'left_eye', # 54 - FB_L_Eye_000
+ 'gta_SKEL_L_Finger00', # 55
+ 'left_index_1', # 56 - SKEL_L_Finger10
+ 'left_middle_1', # 57 - SKEL_L_Finger20
+ 'left_ring_1', # 58 - SKEL_L_Finger30
+ 'left_pinky_1', # 59 - SKEL_L_Finger40
+ 'right_eye', # 60 - FB_R_Eye_000
+ 'gta_PH_R_Hand', # 61
+ 'gta_FB_L_Lip_Corner_000', # 62
+ 'gta_IK_R_Foot', # 63
+ 'gta_RB_Neck_1', # 64
+ 'gta_IK_L_Hand', # 65
+ 'gta_RB_R_ArmRoll', # 66
+ 'gta_FB_Brow_Centre_000', # 67
+ 'gta_FB_R_Lid_Upper_000', # 68
+ 'gta_RB_R_ForeArmRoll', # 69
+ 'gta_FB_L_Lid_Upper_000', # 70
+ 'gta_MH_L_Knee', # 71
+ 'gta_FB_Jaw_000', # 72
+ 'gta_FB_L_Lip_Bot_000', # 73
+ 'gta_FB_Tongue_000', # 74
+ 'gta_FB_R_Lip_Bot_000', # 75
+ 'gta_IK_Root', # 76
+ 'gta_PH_L_Foot', # 77
+ 'gta_FB_L_Brow_Out_000', # 78
+ 'gta_SKEL_R_Finger00', # 79
+ 'right_index_1', # 80 - SKEL_R_Finger10
+ 'right_middle_1', # 81 - SKEL_R_Finger20
+ 'right_ring_1', # 82 - SKEL_R_Finger30
+ 'right_pinky_1', # 83 - SKEL_R_Finger40
+ 'gta_PH_L_Hand', # 84
+ 'gta_RB_L_ForeArmRoll', # 85
+ 'gta_FB_UpperLip_000', # 86
+ 'right_thumb_2', # 87 - SKEL_R_Finger01
+ 'right_thumb_3', # 88 - SKEL_R_Finger02
+ 'right_ring_2', # 89 - SKEL_R_Finger31
+ 'right_ring_3', # 90 - SKEL_R_Finger32
+ 'right_pinky_2', # 91 - SKEL_R_Finger41
+ 'right_pinky_3', # 92 - SKEL_R_Finger42
+ 'right_index_2', # 93 - SKEL_R_Finger11
+ 'right_index_3', # 94 - SKEL_R_Finger12
+ 'right_middle_2', # 95 - SKEL_R_Finger21
+ 'right_middle_3', # 96 - SKEL_R_Finger22
+ 'gta_FACIAL_facialRoot', # 97
+ 'gta_IK_L_Foot', # 98
+ 'nose' # 99 - interpolated nose
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/h36m.py b/detrsmpl/core/conventions/keypoints_mapping/h36m.py
new file mode 100644
index 0000000000000000000000000000000000000000..34d7cd01de2b723398810256a475f94298ed2bc2
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/h36m.py
@@ -0,0 +1,59 @@
+H36M_KEYPOINTS = [
+ 'pelvis_extra',
+ 'left_hip_extra',
+ 'left_knee',
+ 'left_ankle',
+ 'right_hip_extra',
+ 'right_knee',
+ 'right_ankle',
+ 'spine_extra',
+ 'neck_extra',
+ 'head_extra',
+ 'headtop',
+ 'left_shoulder',
+ 'left_elbow',
+ 'left_wrist',
+ 'right_shoulder',
+ 'right_elbow',
+ 'right_wrist',
+]
+
+H36M_KEYPOINTS_MMPOSE = [
+ 'pelvis_extra',
+ 'right_hip_extra',
+ 'right_knee',
+ 'right_ankle',
+ 'left_hip_extra',
+ 'left_knee',
+ 'left_ankle',
+ 'spine_extra',
+ 'neck_extra',
+ 'head_extra',
+ 'headtop',
+ 'left_shoulder',
+ 'left_elbow',
+ 'left_wrist',
+ 'right_shoulder',
+ 'right_elbow',
+ 'right_wrist',
+]
+
+H36M_KEYPOINTS_SMPLX = [
+ 'pelvis',
+ 'left_hip',
+ 'left_knee',
+ 'left_ankle',
+ 'right_hip',
+ 'right_knee',
+ 'right_ankle',
+ 'spine',
+ 'neck', # 'thorax',
+ 'neck/nose',
+ 'head', # 'head_h36m',
+ 'left_shoulder',
+ 'left_elbow',
+ 'left_wrist',
+ 'right_shoulder',
+ 'right_elbow',
+ 'right_wrist'
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/human_data.py b/detrsmpl/core/conventions/keypoints_mapping/human_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..c96f9e8af2ba621987f98a3d52071e327c990ea4
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/human_data.py
@@ -0,0 +1,534 @@
+from collections import defaultdict
+
+HUMAN_DATA = [
+ 'pelvis',
+ 'left_hip',
+ 'right_hip',
+ 'spine_1',
+ 'left_knee',
+ 'right_knee',
+ 'spine_2',
+ 'left_ankle',
+ 'right_ankle',
+ 'spine_3',
+ 'left_foot',
+ 'right_foot',
+ 'neck',
+ 'left_collar',
+ 'right_collar',
+ 'head',
+ 'left_shoulder',
+ 'right_shoulder',
+ 'left_elbow',
+ 'right_elbow',
+ 'left_wrist',
+ 'right_wrist',
+ 'jaw',
+ 'left_eyeball',
+ 'right_eyeball',
+ 'left_index_1',
+ 'left_index_2',
+ 'left_index_3',
+ 'left_middle_1',
+ 'left_middle_2',
+ 'left_middle_3',
+ 'left_pinky_1',
+ 'left_pinky_2',
+ 'left_pinky_3',
+ 'left_ring_1',
+ 'left_ring_2',
+ 'left_ring_3',
+ 'left_thumb_1',
+ 'left_thumb_2',
+ 'left_thumb_3',
+ 'right_index_1',
+ 'right_index_2',
+ 'right_index_3',
+ 'right_middle_1',
+ 'right_middle_2',
+ 'right_middle_3',
+ 'right_pinky_1',
+ 'right_pinky_2',
+ 'right_pinky_3',
+ 'right_ring_1',
+ 'right_ring_2',
+ 'right_ring_3',
+ 'right_thumb_1',
+ 'right_thumb_2',
+ 'right_thumb_3',
+ 'nose',
+ 'right_eye',
+ 'left_eye',
+ 'right_ear',
+ 'left_ear',
+ 'left_bigtoe',
+ 'left_smalltoe',
+ 'left_heel',
+ 'right_bigtoe',
+ 'right_smalltoe',
+ 'right_heel',
+ 'left_thumb',
+ 'left_index',
+ 'left_middle',
+ 'left_ring',
+ 'left_pinky',
+ 'right_thumb',
+ 'right_index',
+ 'right_middle',
+ 'right_ring',
+ 'right_pinky',
+ 'right_eyebrow_1',
+ 'right_eyebrow_2',
+ 'right_eyebrow_3',
+ 'right_eyebrow_4',
+ 'right_eyebrow_5',
+ 'left_eyebrow_5',
+ 'left_eyebrow_4',
+ 'left_eyebrow_3',
+ 'left_eyebrow_2',
+ 'left_eyebrow_1',
+ 'nosebridge_1',
+ 'nosebridge_2',
+ 'nosebridge_3',
+ 'nosebridge_4',
+ 'right_nose_2', # original name: nose_1
+ 'right_nose_1', # original name: nose_2
+ 'nose_middle', # original name: nose_3
+ 'left_nose_1', # original name: nose_4
+ 'left_nose_2', # original name: nose_5
+ 'right_eye_1',
+ 'right_eye_2',
+ 'right_eye_3',
+ 'right_eye_4',
+ 'right_eye_5',
+ 'right_eye_6',
+ 'left_eye_4',
+ 'left_eye_3',
+ 'left_eye_2',
+ 'left_eye_1',
+ 'left_eye_6',
+ 'left_eye_5',
+ 'right_mouth_1', # original name: mouth_1
+ 'right_mouth_2', # original name: mouth_2
+ 'right_mouth_3', # original name: mouth_3
+ 'mouth_top', # original name: mouth_4
+ 'left_mouth_3', # original name: mouth_5
+ 'left_mouth_2', # original name: mouth_6
+ 'left_mouth_1', # original name: mouth_7
+ 'left_mouth_5', # original name: mouth_8
+ 'left_mouth_4', # original name: mouth_9
+ 'mouth_bottom', # original name: mouth_10
+ 'right_mouth_4', # original name: mouth_11
+ 'right_mouth_5', # original name: mouth_12
+ 'right_lip_1', # original name: lip_1
+ 'right_lip_2', # original name: lip_2
+ 'lip_top', # original name: lip_3
+ 'left_lip_2', # original name: lip_4
+ 'left_lip_1', # original name: lip_5
+ 'left_lip_3', # original name: lip_6
+ 'lip_bottom', # original name: lip_7
+ 'right_lip_3', # original name: lip_8
+ 'right_contour_1', # original name: face_contour_1
+ 'right_contour_2', # original name: face_contour_2
+ 'right_contour_3', # original name: face_contour_3
+ 'right_contour_4', # original name: face_contour_4
+ 'right_contour_5', # original name: face_contour_5
+ 'right_contour_6', # original name: face_contour_6
+ 'right_contour_7', # original name: face_contour_7
+ 'right_contour_8', # original name: face_contour_8
+ 'contour_middle', # original name: face_contour_9
+ 'left_contour_8', # original name: face_contour_10
+ 'left_contour_7', # original name: face_contour_11
+ 'left_contour_6', # original name: face_contour_12
+ 'left_contour_5', # original name: face_contour_13
+ 'left_contour_4', # original name: face_contour_14
+ 'left_contour_3', # original name: face_contour_15
+ 'left_contour_2', # original name: face_contour_16
+ 'left_contour_1', # original name: face_contour_17
+ # J_regressor_extra
+ 'right_hip_extra',
+ 'left_hip_extra',
+ 'neck_extra', # LSP
+ 'headtop', # LSP mpii peen_action mpi_inf_3dhp
+ 'pelvis_extra', # MPII
+ 'thorax_extra', # MPII
+ 'spine_extra', # H36M
+ 'jaw_extra', # H36M
+ 'head_extra', # H36M
+ # openpose
+ 'nose_openpose',
+ 'neck_openpose',
+ 'right_shoulder_openpose',
+ 'right_elbow_openpose',
+ 'right_wrist_openpose',
+ 'left_shoulder_openpose',
+ 'left_elbow_openpose',
+ 'left_wrist_openpose',
+ 'pelvis_openpose',
+ 'right_hip_openpose',
+ 'right_knee_openpose',
+ 'right_ankle_openpose',
+ 'left_hip_openpose',
+ 'left_knee_openpose',
+ 'left_ankle_openpose',
+ 'right_eye_openpose',
+ 'left_eye_openpose',
+ 'right_ear_openpose',
+ 'left_ear_openpose',
+ 'left_bigtoe_openpose',
+ 'left_smalltoe_openpose',
+ 'left_heel_openpose',
+ 'right_bigtoe_openpose',
+ 'right_smalltoe_openpose',
+ 'right_heel_openpose',
+ # 3dhp
+ 'spine_4_3dhp',
+ 'left_clavicle_3dhp',
+ 'right_clavicle_3dhp',
+ 'left_hand_3dhp',
+ 'right_hand_3dhp',
+ 'left_toe_3dhp',
+ 'right_toe_3dhp',
+ 'head_h36m', # H36M GT
+ 'headtop_h36m', # H36M GT
+ 'head_bottom_pt', # pose track
+ 'left_hand', # SMPL
+ 'right_hand', # SMPL
+]
+
+APPROXIMATE_MAPPING_LIST = [
+ # extra
+ ['pelvis', 'pelvis_openpose', 'pelvis_extra'],
+ ['left_hip', 'left_hip_openpose', 'left_hip_extra'],
+ ['right_hip', 'right_hip_openpose', 'right_hip_extra'],
+ ['neck', 'neck_openpose', 'neck_extra'],
+ ['jaw', 'jaw_extra'],
+ ['head_extra', 'head_h36m'],
+ ['headtop', 'headtop_h36m'],
+ # 3dhp
+ ['left_hand', 'left_hand_3dhp'],
+ ['right_hand', 'right_hand_3dhp'],
+ # openpose
+ ['nose', 'nose_openpose'],
+ ['right_shoulder', 'right_shoulder_openpose'],
+ ['right_elbow', 'right_elbow_openpose'],
+ ['right_wrist', 'right_wrist_openpose'],
+ ['left_shoulder', 'left_shoulder_openpose'],
+ ['left_elbow', 'left_elbow_openpose'],
+ ['left_wrist', 'left_wrist_openpose'],
+ ['right_knee', 'right_knee_openpose'],
+ ['right_ankle', 'right_ankle_openpose'],
+ ['left_knee', 'left_knee_openpose'],
+ ['left_ankle', 'left_ankle_openpose'],
+ ['right_eye', 'right_eye_openpose'],
+ ['left_eye', 'left_eye_openpose'],
+ ['right_ear', 'right_ear_openpose'],
+ ['left_ear', 'left_ear_openpose'],
+ ['left_bigtoe', 'left_bigtoe_openpose'],
+ ['left_smalltoe', 'left_smalltoe_openpose'],
+ ['left_heel', 'left_heel_openpose'],
+ ['right_bigtoe', 'right_bigtoe_openpose'],
+ ['right_smalltoe', 'right_smalltoe_openpose'],
+ ['right_heel', 'right_heel_openpose'],
+]
+
+APPROXIMATE_MAP = defaultdict(list)
+for group in APPROXIMATE_MAPPING_LIST:
+ for member in group:
+ for other_member in group:
+ if member == other_member:
+ continue
+ APPROXIMATE_MAP[member].append(other_member)
+
+HUMAN_DATA_HEAD = [
+ 'head', 'jaw', 'left_eyeball', 'right_eyeball', 'nose', 'right_eye',
+ 'left_eye', 'right_ear', 'left_ear', 'right_eyebrow_1', 'right_eyebrow_2',
+ 'right_eyebrow_3', 'right_eyebrow_4', 'right_eyebrow_5', 'left_eyebrow_5',
+ 'left_eyebrow_4', 'left_eyebrow_3', 'left_eyebrow_2', 'left_eyebrow_1',
+ 'nosebridge_1', 'nosebridge_2', 'nosebridge_3', 'nosebridge_4',
+ 'right_nose_2', 'right_nose_1', 'nose_middle', 'left_nose_1',
+ 'left_nose_2', 'right_eye_1', 'right_eye_2', 'right_eye_3', 'right_eye_4',
+ 'right_eye_5', 'right_eye_6', 'left_eye_4', 'left_eye_3', 'left_eye_2',
+ 'left_eye_1', 'left_eye_6', 'left_eye_5', 'right_mouth_1', 'right_mouth_2',
+ 'right_mouth_3', 'mouth_top', 'left_mouth_3', 'left_mouth_2',
+ 'left_mouth_1', 'left_mouth_5', 'left_mouth_4', 'mouth_bottom',
+ 'right_mouth_4', 'right_mouth_5', 'right_lip_1', 'right_lip_2', 'lip_top',
+ 'left_lip_2', 'left_lip_1', 'left_lip_3', 'lip_bottom', 'right_lip_3',
+ 'right_contour_1', 'right_contour_2', 'right_contour_3', 'right_contour_4',
+ 'right_contour_5', 'right_contour_6', 'right_contour_7', 'right_contour_8',
+ 'contour_middle', 'left_contour_8', 'left_contour_7', 'left_contour_6',
+ 'left_contour_5', 'left_contour_4', 'left_contour_3', 'left_contour_2',
+ 'left_contour_1', 'headtop', 'jaw_extra', 'head_extra', 'nose_openpose',
+ 'right_eye_openpose', 'left_eye_openpose', 'right_ear_openpose',
+ 'left_ear_openpose', 'headtop_h36m', 'head_bottom_pt', 'head_h36m'
+]
+
+HUMAN_DATA_LEFT_HAND = [
+ 'left_index_1', 'left_index_2', 'left_index_3', 'left_middle_1',
+ 'left_middle_2', 'left_middle_3', 'left_pinky_1', 'left_pinky_2',
+ 'left_pinky_3', 'left_ring_1', 'left_ring_2', 'left_ring_3',
+ 'left_thumb_1', 'left_thumb_2', 'left_thumb_3', 'left_thumb', 'left_index',
+ 'left_middle', 'left_ring', 'left_pinky', 'left_hand_3dhp', 'left_hand'
+]
+
+HUMAN_DATA_RIGHT_HAND = [
+ 'right_index_1', 'right_index_2', 'right_index_3', 'right_middle_1',
+ 'right_middle_2', 'right_middle_3', 'right_pinky_1', 'right_pinky_2',
+ 'right_pinky_3', 'right_ring_1', 'right_ring_2', 'right_ring_3',
+ 'right_thumb_1', 'right_thumb_2', 'right_thumb_3', 'right_thumb',
+ 'right_index', 'right_middle', 'right_ring', 'right_pinky',
+ 'right_hand_3dhp', 'right_hand'
+]
+
+HUMAN_DATA_SHOULDER = [
+ 'left_shoulder', 'left_shoulder_openpose', 'right_shoulder',
+ 'right_shoulder_openpose'
+]
+
+HUMAN_DATA_HIP = [
+ 'left_hip', 'left_hip_openpose', 'left_hip_extra', 'right_hip',
+ 'right_hip_openpose', 'right_hip_extra'
+]
+
+HUMAN_DATA_BODY = HUMAN_DATA_SHOULDER + HUMAN_DATA_HIP + [
+ 'pelvis', 'spine_1', 'left_knee', 'right_knee', 'spine_2', 'left_ankle',
+ 'right_ankle', 'spine_3', 'left_foot', 'right_foot', 'neck', 'left_collar',
+ 'right_collar', 'left_elbow', 'right_elbow', 'left_wrist', 'right_wrist',
+ 'left_bigtoe', 'left_smalltoe', 'left_heel', 'right_bigtoe',
+ 'right_smalltoe', 'right_heel', 'neck_extra', 'pelvis_extra',
+ 'thorax_extra', 'spine_extra', 'neck_openpose', 'right_elbow_openpose',
+ 'right_wrist_openpose', 'left_elbow_openpose', 'left_wrist_openpose',
+ 'pelvis_openpose', 'right_knee_openpose', 'right_ankle_openpose',
+ 'left_knee_openpose', 'left_ankle_openpose', 'left_bigtoe_openpose',
+ 'left_smalltoe_openpose', 'left_heel_openpose', 'right_bigtoe_openpose',
+ 'right_smalltoe_openpose', 'right_heel_openpose', 'spine_4_3dhp',
+ 'left_clavicle_3dhp', 'right_clavicle_3dhp', 'left_toe_3dhp',
+ 'right_toe_3dhp'
+]
+
+HUMAN_DATA_PARTS = {
+ 'head': HUMAN_DATA_HEAD,
+ 'left_hand': HUMAN_DATA_LEFT_HAND,
+ 'right_hand': HUMAN_DATA_RIGHT_HAND,
+ 'shoulder': HUMAN_DATA_SHOULDER,
+ 'hip': HUMAN_DATA_HIP,
+ 'body': HUMAN_DATA_BODY
+}
+
+HUMAN_DATA_LIMBS = {
+ 'body': [
+ ['pelvis', 'left_hip'],
+ ['pelvis', 'right_hip'],
+ ['pelvis', 'spine_1'],
+ ['spine_1', 'spine_2'],
+ ['spine_2', 'spine_3'],
+ ['spine_3', 'neck'],
+ ['neck', 'head'],
+ ['left_ankle', 'left_knee'],
+ ['left_knee', 'left_hip'],
+ ['right_ankle', 'right_knee'],
+ ['right_knee', 'right_hip'],
+ ['right_ankle', 'right_foot'],
+ ['left_ankle', 'left_foot'],
+ ['left_hip', 'right_hip'],
+ ['left_shoulder', 'left_hip'],
+ ['right_shoulder', 'right_hip'],
+ ['left_collar', 'spine_3'],
+ ['right_collar', 'spine_3'],
+ ['right_collar', 'right_shoulder'],
+ ['left_collar', 'left_shoulder'],
+ ['left_shoulder', 'right_shoulder'],
+ ['left_shoulder', 'left_elbow'],
+ ['right_shoulder', 'right_elbow'],
+ ['left_elbow', 'left_wrist'],
+ ['right_elbow', 'right_wrist'],
+ ['left_ankle', 'left_bigtoe'],
+ ['left_ankle', 'left_smalltoe'],
+ ['left_ankle', 'left_heel'],
+ ['right_ankle', 'right_bigtoe'],
+ ['right_ankle', 'right_smalltoe'],
+ ['right_ankle', 'right_heel'],
+ ['left_shoulder', 'left_ear'],
+ ['right_shoulder', 'right_ear'],
+ ['right_ear', 'right_eye'],
+ ['right_eye', 'nose'],
+ ['nose', 'left_eye'],
+ ['left_eye', 'left_ear'],
+ ['nose', 'jaw'],
+ ['jaw', 'neck'],
+ # extra limbs
+ ['pelvis_extra', 'left_hip_extra'],
+ ['pelvis_extra', 'right_hip_extra'],
+ ['left_hip_extra', 'left_knee'],
+ ['right_hip_extra', 'right_knee'],
+ ['left_hip_extra', 'left_shoulder'],
+ ['right_hip_extra', 'right_shoulder'],
+ ['pelvis_extra', 'spine_1'],
+ ['spine_2', 'spine_extra'],
+ ['spine_extra', 'spine_3'],
+ ['spine_3', 'thorax_extra'],
+ ['thorax_extra', 'left_shoulder'],
+ ['thorax_extra', 'right_shoulder'],
+ ['thorax_extra', 'neck_extra'],
+ ['neck_extra', 'jaw_extra'],
+ ['jaw_extra', 'nose'],
+ ['head_extra', 'nose'],
+ ['head_extra', 'headtop'],
+ ['head_extra', 'neck_extra'],
+ ['neck_extra', 'headtop'],
+ ['right_hip_extra', 'left_hip_extra'],
+ ['right_eye_openpose', 'right_ear_openpose'],
+ ['left_ear_openpose', 'left_eye_openpose'],
+ ['right_shoulder_openpose', 'right_elbow_openpose'],
+ ['right_elbow_openpose', 'right_wrist_openpose'],
+ ['left_shoulder_openpose', 'right_shoulder_openpose'],
+ ['left_shoulder_openpose', 'left_elbow_openpose'],
+ ['left_elbow_openpose', 'left_wrist_openpose'],
+        ['pelvis_openpose', 'headtop'],
+ ['neck_extra', 'right_hip_openpose'],
+ ['neck_extra', 'left_hip_openpose'],
+ ['right_hip_openpose', 'right_shoulder_openpose'],
+ ['right_hip_openpose', 'right_knee_openpose'],
+ ['left_hip_openpose', 'left_shoulder_openpose'],
+ ['left_hip_openpose', 'left_knee_openpose'],
+ ['right_knee_openpose', 'right_ankle_openpose'],
+ ['left_knee_openpose', 'left_ankle_openpose'],
+ ['right_ankle_openpose', 'right_heel_openpose'],
+ ['left_ankle_openpose', 'left_heel_openpose'],
+ ['right_heel_openpose', 'right_bigtoe_openpose'],
+ ['right_heel_openpose', 'right_smalltoe_openpose'],
+ ['left_ankle_openpose', 'left_bigtoe_openpose'],
+ ['left_ankle_openpose', 'left_smalltoe_openpose'],
+ ],
+ 'face': [['right_contour_1', 'right_contour_2'],
+ ['right_contour_2', 'right_contour_3'],
+ ['right_contour_3', 'right_contour_4'],
+ ['right_contour_4', 'right_contour_5'],
+ ['right_contour_5', 'right_contour_6'],
+ ['right_contour_6', 'right_contour_7'],
+ ['right_contour_7', 'right_contour_8'],
+ ['right_contour_8', 'contour_middle'],
+ ['contour_middle', 'left_contour_8'],
+ ['left_contour_8', 'left_contour_7'],
+ ['left_contour_7', 'left_contour_6'],
+ ['left_contour_6', 'left_contour_5'],
+ ['left_contour_5', 'left_contour_4'],
+ ['left_contour_4', 'left_contour_3'],
+ ['left_contour_3', 'left_contour_2'],
+ ['left_contour_2', 'left_contour_1']],
+ 'left_hand': [['left_wrist', 'left_thumb_1'],
+ ['left_thumb_1', 'left_thumb_2'],
+ ['left_thumb_2', 'left_thumb_3'],
+ ['left_thumb_3', 'left_thumb'],
+ ['left_wrist', 'left_index_1'],
+ ['left_index_1', 'left_index_2'],
+ ['left_index_2', 'left_index_3'],
+ ['left_index_3', 'left_index'],
+ ['left_wrist', 'left_middle_1'],
+ ['left_middle_1', 'left_middle_2'],
+ ['left_middle_2', 'left_middle_3'],
+ ['left_middle_3', 'left_middle'],
+ ['left_wrist', 'left_ring_1'],
+ ['left_ring_1', 'left_ring_2'],
+ ['left_ring_2', 'left_ring_3'],
+ ['left_ring_3', 'left_ring'],
+ ['left_wrist', 'left_pinky_1'],
+ ['left_pinky_1', 'left_pinky_2'],
+ ['left_pinky_2', 'left_pinky_3'],
+ ['left_pinky_3', 'left_pinky'],
+ ['left_wrist', 'left_thumb'],
+ ['left_wrist', 'left_index'],
+ ['left_wrist', 'left_middle'],
+ ['left_wrist', 'left_ring'],
+                  ['left_wrist', 'left_pinky']],
+ 'right_hand': [['right_wrist', 'right_thumb_1'],
+ ['right_thumb_1', 'right_thumb_2'],
+ ['right_thumb_2', 'right_thumb_3'],
+ ['right_thumb_3', 'right_thumb'],
+ ['right_wrist', 'right_index_1'],
+ ['right_index_1', 'right_index_2'],
+ ['right_index_2', 'right_index_3'],
+ ['right_index_3', 'right_index'],
+ ['right_wrist', 'right_middle_1'],
+ ['right_middle_1', 'right_middle_2'],
+ ['right_middle_2', 'right_middle_3'],
+ ['right_middle_3', 'right_middle'],
+ ['right_wrist', 'right_ring_1'],
+ ['right_ring_1', 'right_ring_2'],
+ ['right_ring_2', 'right_ring_3'],
+ ['right_ring_3', 'right_ring'],
+ ['right_wrist', 'right_pinky_1'],
+ ['right_pinky_1', 'right_pinky_2'],
+ ['right_pinky_2', 'right_pinky_3'],
+ ['right_pinky_3', 'right_pinky'],
+ ['right_wrist', 'right_thumb'],
+ ['right_wrist', 'right_index'],
+ ['right_wrist', 'right_middle'],
+ ['right_wrist', 'right_ring'],
+ ['right_wrist', 'right_pinky']],
+ 'right_eye':
+ [['right_eye_1', 'right_eye_2'], ['right_eye_2', 'right_eye_3'],
+ ['right_eye_3', 'right_eye_4'], ['right_eye_4', 'right_eye_5'],
+ ['right_eye_5', 'right_eye_6'], ['right_eye_6', 'right_eye_1'],
+ ['right_eyebrow_1', 'right_eyebrow_2'],
+ ['right_eyebrow_2', 'right_eyebrow_3'],
+ ['right_eyebrow_3', 'right_eyebrow_4'],
+ ['right_eyebrow_4', 'right_eyebrow_5']],
+ 'left_eye': [['left_eye_4', 'left_eye_3'], ['left_eye_3', 'left_eye_2'],
+ ['left_eye_2', 'left_eye_1'], ['left_eye_1', 'left_eye_6'],
+ ['left_eye_6', 'left_eye_5'], ['left_eye_5', 'left_eye_4'],
+ ['left_eyebrow_1', 'left_eyebrow_2'],
+ ['left_eyebrow_2', 'left_eyebrow_3'],
+ ['left_eyebrow_3', 'left_eyebrow_4'],
+ ['left_eyebrow_4', 'left_eyebrow_5']],
+ 'mouth':
+ [['right_mouth_1', 'right_mouth_2'], ['right_mouth_2', 'right_mouth_3'],
+ ['right_mouth_3', 'mouth_top'], ['mouth_top', 'left_mouth_3'],
+ ['left_mouth_3', 'left_mouth_2'], ['left_mouth_2', 'left_mouth_1'],
+ ['left_mouth_1', 'left_mouth_5'], ['left_mouth_5', 'left_mouth_4'],
+ ['left_mouth_4', 'mouth_bottom'], ['mouth_bottom', 'right_mouth_4'],
+ ['right_mouth_4', 'right_mouth_5'], ['right_mouth_5', 'right_mouth_1'],
+ ['right_lip_1', 'right_lip_2'], ['right_lip_2', 'lip_top'],
+ ['lip_top', 'left_lip_2'], ['left_lip_2', 'left_lip_1'],
+ ['left_lip_1', 'left_lip_3'], ['left_lip_3', 'lip_bottom'],
+ ['lip_bottom', 'right_lip_3'], ['right_lip_3', 'right_lip_1'],
+
+ ['nose', 'mouth_top'], ['mouth_top', 'right_contour_1'],
+ ['mouth_top', 'left_contour_1'],
+ ['jaw', 'left_contour_1'],
+     ['jaw', 'right_contour_1']],
+ 'nose': [
+ ['nosebridge_1', 'nosebridge_2'],
+ ['nosebridge_2', 'nosebridge_3'],
+ ['nosebridge_3', 'nosebridge_4'],
+ ['right_nose_2', 'right_nose_1'],
+ ['right_nose_1', 'nose_middle'],
+ ['nose_middle', 'left_nose_1'],
+ ['left_nose_1', 'left_nose_2'],
+ ]
+}
+
+HUMAN_DATA_LIMBS_INDEX = {}
+for k in HUMAN_DATA_LIMBS:
+ HUMAN_DATA_LIMBS_INDEX[k] = [[
+ HUMAN_DATA.index(limb[0]),
+ HUMAN_DATA.index(limb[1])
+ ] for limb in HUMAN_DATA_LIMBS[k]]
+
+HUMAN_DATA_PALETTE = {
+ 'left_eye': [[0, 0, 0]],
+ 'right_eye': [[255, 255, 0]],
+ 'nose': [[0, 0, 255]],
+ 'mouth': [[0, 255, 255]],
+ 'face': [[255, 0, 0]],
+ 'left_hand': [[0, 255, 0]],
+ 'right_hand': [[255, 0, 255]],
+}
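+
+# The block below is not part of the original convention file; it is a
+# minimal, illustrative sketch of how the limb index tables above are
+# typically consumed (e.g. by a skeleton visualizer), plus a quick sanity
+# check that every limb resolves to valid HUMAN_DATA indices.
+if __name__ == '__main__':
+    for part_name, limbs in HUMAN_DATA_LIMBS_INDEX.items():
+        assert all(0 <= idx < len(HUMAN_DATA)
+                   for limb in limbs for idx in limb)
+        print(f'{part_name}: {len(limbs)} limbs')
+    # A visualizer would draw one segment per index pair, e.g.:
+    #   for start_idx, end_idx in HUMAN_DATA_LIMBS_INDEX['body']:
+    #       draw_line(keypoints2d[start_idx], keypoints2d[end_idx])
+    # where `draw_line` and `keypoints2d` are hypothetical placeholders.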
diff --git a/detrsmpl/core/conventions/keypoints_mapping/hybrik.py b/detrsmpl/core/conventions/keypoints_mapping/hybrik.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f801e846f510e87d14d9df40f5c7745a7535613
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/hybrik.py
@@ -0,0 +1,31 @@
+HYBRIK_29_KEYPOINTS = [
+ 'pelvis',
+ 'left_hip',
+ 'right_hip', # 2
+ 'spine_1',
+ 'left_knee',
+ 'right_knee', # 5
+ 'spine_2',
+ 'left_ankle',
+ 'right_ankle', # 8
+ 'spine_3',
+ 'left_foot',
+ 'right_foot', # 11
+ 'neck',
+ 'left_collar',
+ 'right_collar', # 14
+ 'jaw', # 15
+ 'left_shoulder',
+ 'right_shoulder', # 17
+ 'left_elbow',
+ 'right_elbow', # 19
+ 'left_wrist',
+ 'right_wrist', # 21
+ 'left_thumb',
+ 'right_thumb', # 23
+ 'head',
+ 'left_middle',
+ 'right_middle', # 26
+ 'left_bigtoe',
+ 'right_bigtoe' # 28
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/instavariety.py b/detrsmpl/core/conventions/keypoints_mapping/instavariety.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e8587f831eb0191924a6fb1115988c766870ce9
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/instavariety.py
@@ -0,0 +1,27 @@
+INSTAVARIETY_KEYPOINTS = [
+ 'right_heel_openpose',
+ 'right_knee_openpose',
+ 'right_hip_openpose',
+ 'left_hip_openpose',
+ 'left_knee_openpose',
+ 'left_heel_openpose',
+ 'right_wrist_openpose',
+ 'right_elbow_openpose',
+ 'right_shoulder_openpose',
+ 'left_shoulder_openpose',
+ 'left_elbow_openpose',
+ 'left_wrist_openpose',
+ 'neck_openpose',
+ 'headtop',
+ 'nose_openpose',
+ 'left_eye_openpose',
+ 'right_eye_openpose',
+ 'left_ear_openpose',
+ 'right_ear_openpose',
+ 'left_bigtoe_openpose',
+ 'right_bigtoe_openpose',
+ 'left_smalltoe_openpose',
+ 'right_smalltoe_openpose',
+ 'left_ankle_openpose',
+ 'right_ankle_openpose',
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/lsp.py b/detrsmpl/core/conventions/keypoints_mapping/lsp.py
new file mode 100644
index 0000000000000000000000000000000000000000..214f16d52eecd2de163ca0ffe40457bfa2860c6c
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/lsp.py
@@ -0,0 +1,16 @@
+LSP_KEYPOINTS = [
+ 'right_ankle',
+ 'right_knee',
+ 'right_hip_extra',
+ 'left_hip_extra',
+ 'left_knee',
+ 'left_ankle',
+ 'right_wrist',
+ 'right_elbow',
+ 'right_shoulder',
+ 'left_shoulder',
+ 'left_elbow',
+ 'left_wrist',
+ 'neck_extra',
+ 'headtop',
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/mano.py b/detrsmpl/core/conventions/keypoints_mapping/mano.py
new file mode 100644
index 0000000000000000000000000000000000000000..225d27751274d62da245af7276214363944b3194
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/mano.py
@@ -0,0 +1,33 @@
+# Original order from MANO J_regressor
+MANO_RIGHT_KEYPOINTS = [
+ 'right_wrist', 'right_index_1', 'right_index_2', 'right_index_3',
+ 'right_middle_1', 'right_middle_2', 'right_middle_3', 'right_pinky_1',
+ 'right_pinky_2', 'right_pinky_3', 'right_ring_1', 'right_ring_2',
+ 'right_ring_3', 'right_thumb_1', 'right_thumb_2', 'right_thumb_3',
+ 'right_thumb', 'right_index', 'right_middle', 'right_ring', 'right_pinky'
+]
+
+MANO_LEFT_KEYPOINTS = [
+ x.replace('right_', 'left_') for x in MANO_RIGHT_KEYPOINTS
+]
+
+# Re-arranged order is compatible with the output of manolayer
+# from official [manopth](https://github.com/hassony2/manopth)
+MANO_REORDER_MAP = [
+ 0, 13, 14, 15, 16, 1, 2, 3, 17, 4, 5, 6, 18, 10, 11, 12, 19, 7, 8, 9, 20
+]
+
+MANO_RIGHT_REORDER_KEYPOINTS = [
+ MANO_RIGHT_KEYPOINTS[i] for i in MANO_REORDER_MAP
+]
+MANO_LEFT_REORDER_KEYPOINTS = [
+ MANO_LEFT_KEYPOINTS[i] for i in MANO_REORDER_MAP
+]
+
+# Deprecated: reserved for backward compatibility
+MANO_KEYPOINTS = MANO_RIGHT_KEYPOINTS
+# Two hands (left + right)
+MANO_HANDS_KEYPOINTS = MANO_LEFT_KEYPOINTS + MANO_RIGHT_KEYPOINTS
+# Reordered two hands (left + right)
+MANO_HANDS_REORDER_KEYPOINTS = \
+ MANO_LEFT_REORDER_KEYPOINTS + MANO_RIGHT_REORDER_KEYPOINTS
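+
+# Not part of the original file: a small illustrative check of the reorder.
+# Applying MANO_REORDER_MAP regroups the joints finger by finger (wrist
+# first, then the full thumb chain, index chain, and so on), which is the
+# ordering the comment above attributes to manopth's manolayer output.
+if __name__ == '__main__':
+    print(MANO_RIGHT_REORDER_KEYPOINTS[:5])
+    # -> ['right_wrist', 'right_thumb_1', 'right_thumb_2',
+    #     'right_thumb_3', 'right_thumb']
+    assert len(MANO_HANDS_REORDER_KEYPOINTS) == 42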
diff --git a/detrsmpl/core/conventions/keypoints_mapping/mpi_inf_3dhp.py b/detrsmpl/core/conventions/keypoints_mapping/mpi_inf_3dhp.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffc180f925d20fc3d3d4adf12198a48bea606c20
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/mpi_inf_3dhp.py
@@ -0,0 +1,81 @@
+MPI_INF_3DHP_KEYPOINTS = [
+ 'spine_3',
+ 'spine_4_3dhp',
+ 'spine_2',
+ 'spine_extra', # close to spine2
+ 'pelvis_extra',
+ 'neck_extra', # throat
+ 'head_extra',
+ 'headtop',
+ 'left_clavicle_3dhp',
+ 'left_shoulder',
+ 'left_elbow',
+ 'left_wrist',
+ 'left_hand_3dhp',
+ 'right_clavicle_3dhp',
+ 'right_shoulder',
+ 'right_elbow',
+ 'right_wrist',
+ 'right_hand_3dhp',
+ 'left_hip_extra',
+ 'left_knee',
+ 'left_ankle',
+ 'left_foot',
+ 'left_toe_3dhp',
+ 'right_hip_extra',
+ 'right_knee',
+ 'right_ankle',
+ 'right_foot',
+ 'right_toe_3dhp'
+]
+
+MPI_INF_3DHP_TEST_KEYPOINTS = [
+ 'headtop',
+ 'neck_extra',
+ 'right_shoulder',
+ 'right_elbow',
+ 'right_wrist',
+ 'left_shoulder',
+ 'left_elbow',
+ 'left_wrist',
+ 'right_hip_extra',
+ 'right_knee',
+ 'right_ankle',
+ 'left_hip_extra',
+ 'left_knee',
+ 'left_ankle',
+ 'pelvis_extra',
+ 'spine_extra', # close to spine2
+ 'head_extra'
+]
+
+HYBRIK_MPI_INF_3DHP_KEYPOINTS = [
+ 'spine_3',
+ 'spine_4_3dhp',
+ 'spine_2',
+ 'spine_extra', # close to spine2
+ 'pelvis',
+ 'neck', # throat
+ 'head_extra',
+ 'headtop',
+ 'left_clavicle_3dhp',
+ 'left_shoulder',
+ 'left_elbow',
+ 'left_wrist',
+ 'left_hand_3dhp',
+ 'right_clavicle_3dhp',
+ 'right_shoulder',
+ 'right_elbow',
+ 'right_wrist',
+ 'right_hand_3dhp',
+ 'left_hip',
+ 'left_knee',
+ 'left_ankle',
+ 'left_foot',
+ 'left_toe_3dhp',
+ 'right_hip',
+ 'right_knee',
+ 'right_ankle',
+ 'right_foot',
+ 'right_toe_3dhp'
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/mpii.py b/detrsmpl/core/conventions/keypoints_mapping/mpii.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e78953cba05941b0054e0ffe72f962c5399a9fe
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/mpii.py
@@ -0,0 +1,18 @@
+MPII_KEYPOINTS = [
+ 'right_ankle',
+ 'right_knee',
+ 'right_hip_extra',
+ 'left_hip_extra',
+ 'left_knee',
+ 'left_ankle',
+ 'pelvis_extra',
+ 'thorax_extra',
+ 'neck_extra',
+ 'headtop',
+ 'right_wrist',
+ 'right_elbow',
+ 'right_shoulder',
+ 'left_shoulder',
+ 'left_elbow',
+ 'left_wrist',
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/openpose.py b/detrsmpl/core/conventions/keypoints_mapping/openpose.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebe3b3cdee24a6c088f64a70238c6f8963213919
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/openpose.py
@@ -0,0 +1,452 @@
+"""These keypoint formats are taken from https://github.com/CMU-Perceptual-
+Computing-Lab/openpose/blob/master/src/openpose/pose/poseParameters.cpp.
+OpenPose mainly supports the 25- and 135-keypoint formats now; the
+118-keypoint convention can be found in
+https://github.com/vchoutas/smplify-x/issues/152#issuecomment-923715702.
+
+OPENPOSE_137_KEYPOINTS can be found in
+https://github.com/vchoutas/expose
+
+- OPENPOSE_25_KEYPOINTS: body(25)
+- OPENPOSE_118_KEYPOINTS: body(25) + hand(42) + face(51)
+- OPENPOSE_135_KEYPOINTS: body(25) + hand(40) + face(70)
+- OPENPOSE_137_KEYPOINTS: body(27) + hand(40) + face(70)
+
+Note that:
+1. 135 and coco17 share the first 17 body keypoints
+2. 25 and 118 share the first 25 body keypoints
+3. 137 and 135 share the hand and face parts
+"""
+
+OPENPOSE_135_KEYPOINTS = [
+ 'nose',
+ 'left_eye',
+ 'right_eye',
+ 'left_ear',
+ 'right_ear',
+ 'left_shoulder',
+ 'right_shoulder',
+ 'left_elbow',
+ 'right_elbow',
+ 'left_wrist',
+ 'right_wrist',
+ 'left_hip',
+ 'right_hip',
+ 'left_knee',
+ 'right_knee',
+ 'left_ankle',
+ 'right_ankle',
+ 'neck', # upper_neck
+ 'head',
+ 'left_bigtoe',
+ 'left_smalltoe',
+ 'left_heel',
+ 'right_bigtoe',
+ 'right_smalltoe',
+ 'right_heel',
+ 'left_thumb_1',
+ 'left_thumb_2',
+ 'left_thumb_3',
+ 'left_thumb',
+ 'left_index_1',
+ 'left_index_2',
+ 'left_index_3',
+ 'left_index',
+ 'left_middle_1',
+ 'left_middle_2',
+ 'left_middle_3',
+ 'left_middle',
+ 'left_ring_1',
+ 'left_ring_2',
+ 'left_ring_3',
+ 'left_ring',
+ 'left_pinky_1',
+ 'left_pinky_2',
+ 'left_pinky_3',
+ 'left_pinky',
+ 'right_thumb_1',
+ 'right_thumb_2',
+ 'right_thumb_3',
+ 'right_thumb',
+ 'right_index_1',
+ 'right_index_2',
+ 'right_index_3',
+ 'right_index',
+ 'right_middle_1',
+ 'right_middle_2',
+ 'right_middle_3',
+ 'right_middle',
+ 'right_ring_1',
+ 'right_ring_2',
+ 'right_ring_3',
+ 'right_ring',
+ 'right_pinky_1',
+ 'right_pinky_2',
+ 'right_pinky_3',
+ 'right_pinky',
+ 'right_contour_1', # original name: face_contour_1
+ 'right_contour_2', # original name: face_contour_2
+ 'right_contour_3', # original name: face_contour_3
+ 'right_contour_4', # original name: face_contour_4
+ 'right_contour_5', # original name: face_contour_5
+ 'right_contour_6', # original name: face_contour_6
+ 'right_contour_7', # original name: face_contour_7
+ 'right_contour_8', # original name: face_contour_8
+ 'contour_middle', # original name: face_contour_9
+ 'left_contour_8', # original name: face_contour_10
+ 'left_contour_7', # original name: face_contour_11
+ 'left_contour_6', # original name: face_contour_12
+ 'left_contour_5', # original name: face_contour_13
+ 'left_contour_4', # original name: face_contour_14
+ 'left_contour_3', # original name: face_contour_15
+ 'left_contour_2', # original name: face_contour_16
+ 'left_contour_1', # original name: face_contour_17
+ 'right_eyebrow_1',
+ 'right_eyebrow_2',
+ 'right_eyebrow_3',
+ 'right_eyebrow_4',
+ 'right_eyebrow_5',
+ 'left_eyebrow_5',
+ 'left_eyebrow_4',
+ 'left_eyebrow_3',
+ 'left_eyebrow_2',
+ 'left_eyebrow_1',
+ 'nosebridge_1',
+ 'nosebridge_2',
+ 'nosebridge_3',
+ 'nosebridge_4',
+ 'right_nose_2', # original name: nose_1
+ 'right_nose_1', # original name: nose_2
+ 'nose_middle', # original name: nose_3
+ 'left_nose_1', # original name: nose_4
+ 'left_nose_2', # original name: nose_5
+ 'right_eye_1',
+ 'right_eye_2',
+ 'right_eye_3',
+ 'right_eye_4',
+ 'right_eye_5',
+ 'right_eye_6',
+ 'left_eye_4',
+ 'left_eye_3',
+ 'left_eye_2',
+ 'left_eye_1',
+ 'left_eye_6',
+ 'left_eye_5',
+ 'right_mouth_1', # original name: mouth_1
+ 'right_mouth_2', # original name: mouth_2
+ 'right_mouth_3', # original name: mouth_3
+ 'mouth_top', # original name: mouth_4
+ 'left_mouth_3', # original name: mouth_5
+ 'left_mouth_2', # original name: mouth_6
+ 'left_mouth_1', # original name: mouth_7
+ 'left_mouth_5', # original name: mouth_8
+ 'left_mouth_4', # original name: mouth_9
+ 'mouth_bottom', # original name: mouth_10
+ 'right_mouth_4', # original name: mouth_11
+ 'right_mouth_5', # original name: mouth_12
+ 'right_lip_1', # original name: lip_1
+ 'right_lip_2', # original name: lip_2
+ 'lip_top', # original name: lip_3
+ 'left_lip_2', # original name: lip_4
+ 'left_lip_1', # original name: lip_5
+ 'left_lip_3', # original name: lip_6
+ 'lip_bottom', # original name: lip_7
+ 'right_lip_3', # original name: lip_8
+ 'right_eyeball',
+ 'left_eyeball'
+]
+
+# TODO: OPENPOSE-25->HumanData->SMPLX causes the whole body to be lost
+# OPENPOSE-25: nose_openpose
+# SMPLX: nose
+
+OPENPOSE_25_KEYPOINTS = [
+ 'nose_openpose',
+ 'neck_openpose', # 'upper_neck'
+ 'right_shoulder_openpose',
+ 'right_elbow_openpose',
+ 'right_wrist_openpose',
+ 'left_shoulder_openpose',
+ 'left_elbow_openpose',
+ 'left_wrist_openpose',
+ 'pelvis_openpose', # 'mid_hip'
+ 'right_hip_openpose',
+ 'right_knee_openpose',
+ 'right_ankle_openpose',
+ 'left_hip_openpose',
+ 'left_knee_openpose',
+ 'left_ankle_openpose',
+ 'right_eye_openpose',
+ 'left_eye_openpose',
+ 'right_ear_openpose',
+ 'left_ear_openpose',
+ 'left_bigtoe_openpose',
+ 'left_smalltoe_openpose',
+ 'left_heel_openpose',
+ 'right_bigtoe_openpose',
+ 'right_smalltoe_openpose',
+ 'right_heel_openpose'
+]
+
+OPENPOSE_118_KEYPOINTS = [
+ 'nose_openpose',
+ 'neck_openpose',
+ 'right_shoulder_openpose',
+ 'right_elbow_openpose',
+ 'right_wrist_openpose',
+ 'left_shoulder_openpose',
+ 'left_elbow_openpose',
+ 'left_wrist_openpose',
+ 'pelvis_openpose',
+ 'right_hip_openpose',
+ 'right_knee_openpose',
+ 'right_ankle_openpose',
+ 'left_hip_openpose',
+ 'left_knee_openpose',
+ 'left_ankle_openpose',
+ 'right_eye_openpose',
+ 'left_eye_openpose',
+ 'right_ear_openpose',
+ 'left_ear_openpose',
+ 'left_bigtoe_openpose',
+ 'left_smalltoe_openpose',
+ 'left_heel_openpose',
+ 'right_bigtoe_openpose',
+ 'right_smalltoe_openpose',
+ 'right_heel_openpose',
+ 'left_wrist',
+ 'left_thumb_1',
+ 'left_thumb_2',
+ 'left_thumb_3',
+ 'left_thumb',
+ 'left_index_1',
+ 'left_index_2',
+ 'left_index_3',
+ 'left_index',
+ 'left_middle_1',
+ 'left_middle_2',
+ 'left_middle_3',
+ 'left_middle',
+ 'left_ring_1',
+ 'left_ring_2',
+ 'left_ring_3',
+ 'left_ring',
+ 'left_pinky_1',
+ 'left_pinky_2',
+ 'left_pinky_3',
+ 'left_pinky',
+ 'right_wrist',
+ 'right_thumb_1',
+ 'right_thumb_2',
+ 'right_thumb_3',
+ 'right_thumb',
+ 'right_index_1',
+ 'right_index_2',
+ 'right_index_3',
+ 'right_index',
+ 'right_middle_1',
+ 'right_middle_2',
+ 'right_middle_3',
+ 'right_middle',
+ 'right_ring_1',
+ 'right_ring_2',
+ 'right_ring_3',
+ 'right_ring',
+ 'right_pinky_1',
+ 'right_pinky_2',
+ 'right_pinky_3',
+ 'right_pinky',
+ 'right_eyebrow_1',
+ 'right_eyebrow_2',
+ 'right_eyebrow_3',
+ 'right_eyebrow_4',
+ 'right_eyebrow_5',
+ 'left_eyebrow_5',
+ 'left_eyebrow_4',
+ 'left_eyebrow_3',
+ 'left_eyebrow_2',
+ 'left_eyebrow_1',
+ 'nosebridge_1',
+ 'nosebridge_2',
+ 'nosebridge_3',
+ 'nosebridge_4',
+ 'right_nose_2', # original name: nose_1
+ 'right_nose_1', # original name: nose_2
+ 'nose_middle', # original name: nose_3
+ 'left_nose_1', # original name: nose_4
+ 'left_nose_2', # original name: nose_5
+ 'right_eye_1',
+ 'right_eye_2',
+ 'right_eye_3',
+ 'right_eye_4',
+ 'right_eye_5',
+ 'right_eye_6',
+ 'left_eye_4',
+ 'left_eye_3',
+ 'left_eye_2',
+ 'left_eye_1',
+ 'left_eye_6',
+ 'left_eye_5',
+ 'right_mouth_1', # original name: mouth_1
+ 'right_mouth_2', # original name: mouth_2
+ 'right_mouth_3', # original name: mouth_3
+ 'mouth_top', # original name: mouth_4
+ 'left_mouth_3', # original name: mouth_5
+ 'left_mouth_2', # original name: mouth_6
+ 'left_mouth_1', # original name: mouth_7
+ 'left_mouth_5', # original name: mouth_8
+ 'left_mouth_4', # original name: mouth_9
+ 'mouth_bottom', # original name: mouth_10
+ 'right_mouth_4', # original name: mouth_11
+ 'right_mouth_5', # original name: mouth_12
+ 'right_lip_1', # original name: lip_1
+ 'right_lip_2', # original name: lip_2
+ 'lip_top', # original name: lip_3
+ 'left_lip_2', # original name: lip_4
+ 'left_lip_1', # original name: lip_5
+ 'left_lip_3', # original name: lip_6
+ 'lip_bottom', # original name: lip_7
+ 'right_lip_3', # original name: lip_8
+]
+
+OPENPOSE_JOINTS = [
+ 'nose',
+ 'neck',
+ 'right_shoulder',
+ 'right_elbow',
+ 'right_wrist',
+ 'left_shoulder',
+ 'left_elbow',
+ 'left_wrist',
+ 'pelvis',
+ 'right_hip',
+ 'right_knee',
+ 'right_ankle',
+ 'left_hip',
+ 'left_knee',
+ 'left_ankle',
+ 'right_eye',
+ 'left_eye',
+ 'right_ear',
+ 'left_ear',
+ 'left_wrist_openpose',
+ 'left_thumb_1',
+ 'left_thumb_2',
+ 'left_thumb_3',
+ 'left_thumb',
+ 'left_index_1',
+ 'left_index_2',
+ 'left_index_3',
+ 'left_index',
+ 'left_middle_1',
+ 'left_middle_2',
+ 'left_middle_3',
+ 'left_middle',
+ 'left_ring_1',
+ 'left_ring_2',
+ 'left_ring_3',
+ 'left_ring',
+ 'left_pinky_1',
+ 'left_pinky_2',
+ 'left_pinky_3',
+ 'left_pinky',
+ 'right_wrist_openpose',
+ 'right_thumb_1',
+ 'right_thumb_2',
+ 'right_thumb_3',
+ 'right_thumb',
+ 'right_index_1',
+ 'right_index_2',
+ 'right_index_3',
+ 'right_index',
+ 'right_middle_1',
+ 'right_middle_2',
+ 'right_middle_3',
+ 'right_middle',
+ 'right_ring_1',
+ 'right_ring_2',
+ 'right_ring_3',
+ 'right_ring',
+ 'right_pinky_1',
+ 'right_pinky_2',
+ 'right_pinky_3',
+ 'right_pinky',
+ # Face contour
+ 'right_contour_1',
+ 'right_contour_2',
+ 'right_contour_3',
+ 'right_contour_4',
+ 'right_contour_5',
+ 'right_contour_6',
+ 'right_contour_7',
+ 'right_contour_8',
+ 'contour_middle',
+ 'left_contour_8',
+ 'left_contour_7',
+ 'left_contour_6',
+ 'left_contour_5',
+ 'left_contour_4',
+ 'left_contour_3',
+ 'left_contour_2',
+ 'left_contour_1',
+ # Eye brows
+ 'right_eye_brow_1',
+ 'right_eye_brow_2',
+ 'right_eye_brow_3',
+ 'right_eye_brow_4',
+ 'right_eye_brow_5',
+ 'left_eye_brow_5',
+ 'left_eye_brow_4',
+ 'left_eye_brow_3',
+ 'left_eye_brow_2',
+ 'left_eye_brow_1',
+ 'nosebridge_1',
+ 'nosebridge_2',
+ 'nosebridge_3',
+ 'nosebridge_4',
+ 'right_nose_2',
+ 'right_nose_1',
+ 'nose_middle',
+ 'left_nose_1',
+ 'left_nose_2',
+ 'right_eye_1',
+ 'right_eye_2',
+ 'right_eye_3',
+ 'right_eye_4',
+ 'right_eye_5',
+ 'right_eye_6',
+ 'left_eye_4',
+ 'left_eye_3',
+ 'left_eye_2',
+ 'left_eye_1',
+ 'left_eye_6',
+ 'left_eye_5',
+ 'right_mouth_1',
+ 'right_mouth_2',
+ 'right_mouth_3',
+ 'mouth_top',
+ 'left_mouth_3',
+ 'left_mouth_2',
+ 'left_mouth_1',
+ 'left_mouth_5',
+ 'left_mouth_4',
+ 'mouth_bottom',
+ 'right_mouth_4',
+ 'right_mouth_5',
+ 'right_lip_1',
+ 'right_lip_2',
+ 'lip_top',
+ 'left_lip_2',
+ 'left_lip_1',
+ 'left_lip_3',
+ 'lip_bottom',
+ 'right_lip_3',
+ 'right_eyeball_unused', # not used in expose
+ 'left_eyeball_unused', # not used in expose
+]
+
+OPENPOSE_FEET_KEYPOINTS = [
+ 'left_bigtoe', 'left_smalltoe', 'left_heel', 'right_bigtoe',
+ 'right_smalltoe', 'right_heel'
+]
+OPENPOSE_137_KEYPOINTS = OPENPOSE_JOINTS[:19] + \
+ OPENPOSE_FEET_KEYPOINTS + OPENPOSE_JOINTS[19:]
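+
+# Not part of the original file: a sanity sketch tying the composed
+# conventions back to the sizes stated in the module docstring.
+if __name__ == '__main__':
+    assert len(OPENPOSE_25_KEYPOINTS) == 25
+    assert len(OPENPOSE_118_KEYPOINTS) == 118
+    assert len(OPENPOSE_135_KEYPOINTS) == 135
+    assert len(OPENPOSE_137_KEYPOINTS) == 137
+    # 137 = the 131 OPENPOSE_JOINTS plus the 6 feet keypoints spliced in
+    # after the first 19 body joints.
+    assert len(OPENPOSE_137_KEYPOINTS) == \
+        len(OPENPOSE_JOINTS) + len(OPENPOSE_FEET_KEYPOINTS)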
diff --git a/detrsmpl/core/conventions/keypoints_mapping/penn_action.py b/detrsmpl/core/conventions/keypoints_mapping/penn_action.py
new file mode 100644
index 0000000000000000000000000000000000000000..5d6d70ea2baab877cbc3e83b92b742a099ba78ff
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/penn_action.py
@@ -0,0 +1,15 @@
+PENN_ACTION_KEYPOINTS = [
+ 'head',
+ 'left_shoulder',
+ 'right_shoulder',
+ 'left_elbow',
+ 'right_elbow',
+ 'left_wrist',
+ 'right_wrist',
+ 'left_hip',
+ 'right_hip',
+ 'left_knee',
+ 'right_knee',
+ 'left_ankle',
+ 'right_ankle',
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/posetrack.py b/detrsmpl/core/conventions/keypoints_mapping/posetrack.py
new file mode 100644
index 0000000000000000000000000000000000000000..03700f7c93e09638c2d31dbd55b55eb8b6f050dd
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/posetrack.py
@@ -0,0 +1,6 @@
+POSETRACK_KEYPOINTS = [
+ 'nose', 'head_bottom_pt', 'headtop', 'left_ear', 'right_ear',
+ 'left_shoulder', 'right_shoulder', 'left_elbow', 'right_elbow',
+ 'left_wrist', 'right_wrist', 'left_hip', 'right_hip', 'left_knee',
+ 'right_knee', 'left_ankle', 'right_ankle'
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/pw3d.py b/detrsmpl/core/conventions/keypoints_mapping/pw3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5ec7a5167208a93f24cafd15596aa0cece7af0f
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/pw3d.py
@@ -0,0 +1,20 @@
+PW3D_KEYPOINTS = [
+ 'nose',
+ 'neck_extra',
+ 'right_shoulder',
+ 'right_elbow',
+ 'right_wrist',
+ 'left_shoulder',
+ 'left_elbow',
+ 'left_wrist',
+ 'right_hip_extra',
+ 'right_knee',
+ 'right_ankle',
+ 'left_hip_extra',
+ 'left_knee',
+ 'left_ankle',
+ 'right_eye',
+ 'left_eye',
+ 'right_ear',
+ 'left_ear',
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/smpl.py b/detrsmpl/core/conventions/keypoints_mapping/smpl.py
new file mode 100644
index 0000000000000000000000000000000000000000..52bb70ba6e8a1563e6cbebf4bad1da393477685c
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/smpl.py
@@ -0,0 +1,126 @@
+# the keypoints defined in the SMPL paper
+SMPL_KEYPOINTS = [
+ 'pelvis',
+ 'left_hip',
+ 'right_hip',
+ 'spine_1',
+ 'left_knee',
+ 'right_knee',
+ 'spine_2',
+ 'left_ankle',
+ 'right_ankle',
+ 'spine_3',
+ 'left_foot',
+ 'right_foot',
+ 'neck',
+ 'left_collar',
+ 'right_collar',
+ 'head',
+ 'left_shoulder',
+ 'right_shoulder',
+ 'left_elbow',
+ 'right_elbow',
+ 'left_wrist',
+ 'right_wrist',
+
+ # 'left_hand',
+ # 'right_hand',
+ 'left_middle',
+ 'right_middle'
+]
+
+# the full keypoints produced by the default SMPL J_regressor
+SMPL_45_KEYPOINTS = SMPL_KEYPOINTS + [
+ 'nose',
+ 'right_eye',
+ 'left_eye',
+ 'right_ear',
+ 'left_ear',
+ 'left_bigtoe',
+ 'left_smalltoe',
+ 'left_heel',
+ 'right_bigtoe',
+ 'right_smalltoe',
+ 'right_heel',
+ 'left_thumb',
+ 'left_index',
+ 'left_middle',
+ 'left_ring',
+ 'left_pinky',
+ 'right_thumb',
+ 'right_index',
+ 'right_middle',
+ 'right_ring',
+ 'right_pinky',
+]
+
+# the full keypoints produced by the default SMPL J_regressor and
+# extra_J_regressor (provided by SPIN)
+SMPL_54_KEYPOINTS = SMPL_45_KEYPOINTS + [
+ 'right_hip_extra', # LSP
+ 'left_hip_extra', # LSP
+ 'neck_extra', # LSP
+ 'headtop', # LSP
+ 'pelvis_extra', # MPII
+ 'thorax_extra', # MPII
+ 'spine_extra', # H36M
+ 'jaw_extra', # H36M
+ 'head_extra', # H36M
+]
+
+# SMPL keypoint convention used by SPIN, EFT and so on
+SMPL_49_KEYPOINTS = [
+ # OpenPose
+ 'nose_openpose',
+ 'neck_openpose', # 'upper_neck'
+ 'right_shoulder_openpose',
+ 'right_elbow_openpose',
+ 'right_wrist_openpose',
+ 'left_shoulder_openpose',
+ 'left_elbow_openpose',
+ 'left_wrist_openpose',
+ 'pelvis_openpose',
+ 'right_hip_openpose',
+ 'right_knee_openpose',
+ 'right_ankle_openpose',
+ 'left_hip_openpose',
+ 'left_knee_openpose',
+ 'left_ankle_openpose',
+ 'right_eye_openpose',
+ 'left_eye_openpose',
+ 'right_ear_openpose',
+ 'left_ear_openpose',
+ 'left_bigtoe_openpose',
+ 'left_smalltoe_openpose',
+ 'left_heel_openpose',
+ 'right_bigtoe_openpose',
+ 'right_smalltoe_openpose',
+ 'right_heel_openpose',
+ # 24 Keypoints
+ 'right_ankle',
+ 'right_knee',
+ 'right_hip_extra', # LSP
+ 'left_hip_extra', # LSP
+ 'left_knee',
+ 'left_ankle',
+ 'right_wrist',
+ 'right_elbow',
+ 'right_shoulder',
+ 'left_shoulder',
+ 'left_elbow',
+ 'left_wrist',
+ 'neck_extra', # LSP
+    'headtop',  # LSP mpii penn_action mpi_inf_3dhp
+ 'pelvis_extra', # MPII
+ 'thorax_extra', # MPII
+ 'spine_extra', # H36M
+ 'jaw_extra', # H36M
+ 'head_extra', # H36M
+ 'nose',
+ 'left_eye',
+ 'right_eye',
+ 'left_ear',
+ 'right_ear'
+]
+
+SMPL_24_KEYPOINTS = SMPL_49_KEYPOINTS[-24:]
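+
+# Not part of the original file: an illustrative consistency check of the
+# derived conventions defined above.
+if __name__ == '__main__':
+    assert len(SMPL_KEYPOINTS) == 24
+    assert len(SMPL_45_KEYPOINTS) == 45
+    assert len(SMPL_54_KEYPOINTS) == 54
+    assert len(SMPL_49_KEYPOINTS) == 49
+    # SMPL_49 is 25 OpenPose keypoints followed by the 24 SMPL keypoints,
+    # so the last 24 entries form SMPL_24_KEYPOINTS.
+    assert SMPL_24_KEYPOINTS == SMPL_49_KEYPOINTS[25:]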
diff --git a/detrsmpl/core/conventions/keypoints_mapping/smplx.py b/detrsmpl/core/conventions/keypoints_mapping/smplx.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f247073947a438019741420a322264bcadc3a12
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/smplx.py
@@ -0,0 +1,382 @@
+SMPLX_KEYPOINTS = [
+ 'pelvis',
+ 'left_hip',
+ 'right_hip',
+ 'spine_1',
+ 'left_knee',
+ 'right_knee',
+ 'spine_2',
+ 'left_ankle',
+ 'right_ankle',
+ 'spine_3',
+ 'left_foot',
+ 'right_foot',
+ 'neck',
+ 'left_collar',
+ 'right_collar',
+ 'head',
+ 'left_shoulder',
+ 'right_shoulder',
+ 'left_elbow',
+ 'right_elbow',
+ 'left_wrist',
+ 'right_wrist',
+ 'jaw',
+ 'left_eyeball',
+ 'right_eyeball',
+ 'left_index_1',
+ 'left_index_2',
+ 'left_index_3',
+ 'left_middle_1',
+ 'left_middle_2',
+ 'left_middle_3',
+ 'left_pinky_1',
+ 'left_pinky_2',
+ 'left_pinky_3',
+ 'left_ring_1',
+ 'left_ring_2',
+ 'left_ring_3',
+ 'left_thumb_1',
+ 'left_thumb_2',
+ 'left_thumb_3',
+ 'right_index_1',
+ 'right_index_2',
+ 'right_index_3',
+ 'right_middle_1',
+ 'right_middle_2',
+ 'right_middle_3',
+ 'right_pinky_1',
+ 'right_pinky_2',
+ 'right_pinky_3',
+ 'right_ring_1',
+ 'right_ring_2',
+ 'right_ring_3',
+ 'right_thumb_1',
+ 'right_thumb_2',
+ 'right_thumb_3',
+ 'nose',
+ 'right_eye',
+ 'left_eye',
+ 'right_ear',
+ 'left_ear',
+ 'left_bigtoe',
+ 'left_smalltoe',
+ 'left_heel',
+ 'right_bigtoe',
+ 'right_smalltoe',
+ 'right_heel',
+ 'left_thumb',
+ 'left_index',
+ 'left_middle',
+ 'left_ring',
+ 'left_pinky',
+ 'right_thumb',
+ 'right_index',
+ 'right_middle',
+ 'right_ring',
+ 'right_pinky',
+ 'right_eyebrow_1',
+ 'right_eyebrow_2',
+ 'right_eyebrow_3',
+ 'right_eyebrow_4',
+ 'right_eyebrow_5',
+ 'left_eyebrow_5',
+ 'left_eyebrow_4',
+ 'left_eyebrow_3',
+ 'left_eyebrow_2',
+ 'left_eyebrow_1',
+ 'nosebridge_1',
+ 'nosebridge_2',
+ 'nosebridge_3',
+ 'nosebridge_4',
+ 'right_nose_2', # original name: nose_1
+ 'right_nose_1', # original name: nose_2
+ 'nose_middle', # original name: nose_3
+ 'left_nose_1', # original name: nose_4
+ 'left_nose_2', # original name: nose_5
+ 'right_eye_1',
+ 'right_eye_2',
+ 'right_eye_3',
+ 'right_eye_4',
+ 'right_eye_5',
+ 'right_eye_6',
+ 'left_eye_4',
+ 'left_eye_3',
+ 'left_eye_2',
+ 'left_eye_1',
+ 'left_eye_6',
+ 'left_eye_5',
+ 'right_mouth_1', # original name: mouth_1
+ 'right_mouth_2', # original name: mouth_2
+ 'right_mouth_3', # original name: mouth_3
+ 'mouth_top', # original name: mouth_4
+ 'left_mouth_3', # original name: mouth_5
+ 'left_mouth_2', # original name: mouth_6
+ 'left_mouth_1', # original name: mouth_7
+ 'left_mouth_5', # original name: mouth_8
+ 'left_mouth_4', # original name: mouth_9
+ 'mouth_bottom', # original name: mouth_10
+ 'right_mouth_4', # original name: mouth_11
+ 'right_mouth_5', # original name: mouth_12
+ 'right_lip_1', # original name: lip_1
+ 'right_lip_2', # original name: lip_2
+ 'lip_top', # original name: lip_3
+ 'left_lip_2', # original name: lip_4
+ 'left_lip_1', # original name: lip_5
+ 'left_lip_3', # original name: lip_6
+ 'lip_bottom', # original name: lip_7
+ 'right_lip_3', # original name: lip_8
+ 'right_contour_1', # original name: face_contour_1
+ 'right_contour_2', # original name: face_contour_2
+ 'right_contour_3', # original name: face_contour_3
+ 'right_contour_4', # original name: face_contour_4
+ 'right_contour_5', # original name: face_contour_5
+ 'right_contour_6', # original name: face_contour_6
+ 'right_contour_7', # original name: face_contour_7
+ 'right_contour_8', # original name: face_contour_8
+ 'contour_middle', # original name: face_contour_9
+ 'left_contour_8', # original name: face_contour_10
+ 'left_contour_7', # original name: face_contour_11
+ 'left_contour_6', # original name: face_contour_12
+ 'left_contour_5', # original name: face_contour_13
+ 'left_contour_4', # original name: face_contour_14
+ 'left_contour_3', # original name: face_contour_15
+ 'left_contour_2', # original name: face_contour_16
+ 'left_contour_1', # original name: face_contour_17
+]
+
+SMPLX_LIMBS = {
+ 'body': [['pelvis', 'left_hip'], ['pelvis', 'right_hip'],
+ ['left_hip', 'right_hip'], ['left_shoulder', 'right_shoulder'],
+ ['pelvis', 'spine_1'], ['spine_1', 'spine_2'],
+ ['spine_2', 'spine_3'], ['spine_3', 'neck'], ['neck', 'head'],
+ ['left_ankle', 'left_knee'], ['left_knee', 'left_hip'],
+ ['right_ankle', 'right_knee'], ['right_knee', 'right_hip'],
+ ['right_ankle', 'right_foot'], ['left_ankle', 'left_foot'],
+ ['left_hip', 'right_hip'], ['left_shoulder', 'left_hip'],
+ ['right_shoulder', 'right_hip'], ['left_collar', 'spine_3'],
+ ['right_collar', 'spine_3'], ['right_collar', 'right_shoulder'],
+ ['left_collar', 'left_shoulder'],
+ ['left_shoulder', 'right_shoulder'],
+ ['left_shoulder',
+ 'left_elbow'], ['right_shoulder', 'right_elbow'],
+ ['left_elbow', 'left_wrist'], ['right_elbow', 'right_wrist'],
+ ['left_ankle', 'left_bigtoe'], ['left_ankle', 'left_smalltoe'],
+ ['left_ankle', 'left_heel'], ['right_ankle', 'right_bigtoe'],
+ ['right_ankle', 'right_smalltoe'], ['right_ankle', 'right_heel'],
+ ['left_shoulder', 'left_ear'], ['right_shoulder', 'right_ear'],
+ ['right_ear', 'right_eye'], ['right_eye', 'nose'],
+ ['nose', 'left_eye'], ['left_eye', 'left_ear'], ['nose', 'jaw'],
+ ['jaw', 'neck']],
+ 'face': [['right_contour_1', 'right_contour_2'],
+ ['right_contour_2', 'right_contour_3'],
+ ['right_contour_3', 'right_contour_4'],
+ ['right_contour_4', 'right_contour_5'],
+ ['right_contour_5', 'right_contour_6'],
+ ['right_contour_6', 'right_contour_7'],
+ ['right_contour_7', 'right_contour_8'],
+ ['right_contour_8', 'contour_middle'],
+ ['contour_middle', 'left_contour_8'],
+ ['left_contour_8', 'left_contour_7'],
+ ['left_contour_7', 'left_contour_6'],
+ ['left_contour_6', 'left_contour_5'],
+ ['left_contour_5', 'left_contour_4'],
+ ['left_contour_4', 'left_contour_3'],
+ ['left_contour_3', 'left_contour_2'],
+ ['left_contour_2', 'left_contour_1']],
+ 'left_hand':
+ [['left_wrist', 'left_thumb_1'], ['left_thumb_1', 'left_thumb_2'],
+ ['left_thumb_2', 'left_thumb_3'], ['left_thumb_3', 'left_thumb'],
+ ['left_wrist', 'left_index_1'], ['left_index_1', 'left_index_2'],
+ ['left_index_2', 'left_index_3'], ['left_index_3', 'left_index'],
+ ['left_wrist', 'left_middle_1'], ['left_middle_1', 'left_middle_2'],
+ ['left_middle_2', 'left_middle_3'], ['left_middle_3', 'left_middle'],
+ ['left_wrist', 'left_ring_1'], ['left_ring_1', 'left_ring_2'],
+ ['left_ring_2', 'left_ring_3'], ['left_ring_3', 'left_ring'],
+ ['left_wrist', 'left_pinky_1'], ['left_pinky_1', 'left_pinky_2'],
+ ['left_pinky_2', 'left_pinky_3'], ['left_pinky_3', 'left_pinky']],
+ 'right_hand': [['right_wrist', 'right_thumb_1'],
+ ['right_thumb_1', 'right_thumb_2'],
+ ['right_thumb_2', 'right_thumb_3'],
+ ['right_thumb_3', 'right_thumb'],
+ ['right_wrist', 'right_index_1'],
+ ['right_index_1', 'right_index_2'],
+ ['right_index_2', 'right_index_3'],
+ ['right_index_3', 'right_index'],
+ ['right_wrist', 'right_middle_1'],
+ ['right_middle_1', 'right_middle_2'],
+ ['right_middle_2', 'right_middle_3'],
+ ['right_middle_3', 'right_middle'],
+ ['right_wrist', 'right_ring_1'],
+ ['right_ring_1', 'right_ring_2'],
+ ['right_ring_2', 'right_ring_3'],
+ ['right_ring_3', 'right_ring'],
+ ['right_wrist', 'right_pinky_1'],
+ ['right_pinky_1', 'right_pinky_2'],
+ ['right_pinky_2', 'right_pinky_3'],
+ ['right_pinky_3', 'right_pinky']],
+ 'right_eye':
+ [['right_eye_1', 'right_eye_2'], ['right_eye_2', 'right_eye_3'],
+ ['right_eye_3', 'right_eye_4'], ['right_eye_4', 'right_eye_5'],
+ ['right_eye_5', 'right_eye_6'], ['right_eye_6', 'right_eye_1'],
+ ['right_eyebrow_1', 'right_eyebrow_2'],
+ ['right_eyebrow_2', 'right_eyebrow_3'],
+ ['right_eyebrow_3', 'right_eyebrow_4'],
+ ['right_eyebrow_4', 'right_eyebrow_5']],
+ 'left_eye': [['left_eye_4', 'left_eye_3'], ['left_eye_3', 'left_eye_2'],
+ ['left_eye_2', 'left_eye_1'], ['left_eye_1', 'left_eye_6'],
+ ['left_eye_6', 'left_eye_5'], ['left_eye_5', 'left_eye_4'],
+ ['left_eyebrow_1', 'left_eyebrow_2'],
+ ['left_eyebrow_2', 'left_eyebrow_3'],
+ ['left_eyebrow_3', 'left_eyebrow_4'],
+ ['left_eyebrow_4', 'left_eyebrow_5']],
+ 'mouth':
+ [['right_mouth_1', 'right_mouth_2'], ['right_mouth_2', 'right_mouth_3'],
+ ['right_mouth_3', 'mouth_top'], ['mouth_top', 'left_mouth_3'],
+ ['left_mouth_3', 'left_mouth_2'], ['left_mouth_2', 'left_mouth_1'],
+ ['left_mouth_1', 'left_mouth_5'], ['left_mouth_5', 'left_mouth_4'],
+ ['left_mouth_4', 'mouth_bottom'], ['mouth_bottom', 'right_mouth_4'],
+ ['right_mouth_4', 'right_mouth_5'], ['right_mouth_5', 'right_mouth_1'],
+ ['right_lip_1', 'right_lip_2'], ['right_lip_2', 'lip_top'],
+ ['lip_top', 'left_lip_2'], ['left_lip_2', 'left_lip_1'],
+ ['left_lip_1', 'left_lip_3'], ['left_lip_3', 'lip_bottom'],
+ ['lip_bottom', 'right_lip_3'], ['right_lip_3', 'right_lip_1']],
+ 'nose': [
+ ['nosebridge_1', 'nosebridge_2'],
+ ['nosebridge_2', 'nosebridge_3'],
+ ['nosebridge_3', 'nosebridge_4'],
+ ['right_nose_2', 'right_nose_1'],
+ ['right_nose_1', 'nose_middle'],
+ ['nose_middle', 'left_nose_1'],
+ ['left_nose_1', 'left_nose_2'],
+ ]
+}
+
+SMPLX_LIMBS_INDEX = {}
+for k in SMPLX_LIMBS:
+ SMPLX_LIMBS_INDEX[k] = [[
+ SMPLX_KEYPOINTS.index(limb[0]),
+ SMPLX_KEYPOINTS.index(limb[1])
+ ] for limb in SMPLX_LIMBS[k]]
+
+SMPLX_PALETTE = {
+ 'left_eye': [[0, 0, 0]],
+ 'right_eye': [[0, 0, 0]],
+ 'nose': [[0, 0, 255]],
+ 'mouth': [[0, 255, 255]],
+ 'face': [[255, 0, 0]],
+ 'left_hand': [[0, 0, 0]],
+ 'right_hand': [[0, 0, 0]]
+}
+
+
+joint_idx = \
+ (0,1,2,4,5,7,8,12,16,17,18,19,20,21,60,61,62,63,64,65,59,58,57,56,55, # body joints
+ 37,38,39,66,25,26,27,67,28,29,30,68,34,35,36,69,31,32,33,70, # left hand joints
+ 52,53,54,71,40,41,42,72,43,44,45,73,49,50,51,74,46,47,48,75, # right hand joints
+ 22,15, # jaw, head
+ 57,56, # eyeballs
+ 76,77,78,79,80,81,82,83,84,85, # eyebrow
+ 86,87,88,89, # nose
+ 90,91,92,93,94, # below nose
+ 95,96,97,98,99,100,101,102,103,104,105,106, # eyes
+ 107, # right mouth
+ 108,109,110,111,112, # upper mouth
+ 113, # left mouth
+ 114,115,116,117,118, # lower mouth
+ 119, # right lip
+ 120,121,122, # upper lip
+ 123, # left lip
+ 124,125,126, # lower lip
+ 127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143 # face contour
+ )
+
+SMPLX_137_KEYPOINTS = []
+for idx in joint_idx:
+ SMPLX_137_KEYPOINTS.append(SMPLX_KEYPOINTS[idx])
+
+
+SMPLX_LHAND = [
+ # 'left_thumb_2',
+ 'left_wrist',
+ 'left_thumb',
+ # 'left_index_1',
+ 'left_index',
+ # 'left_middle_1',
+ 'left_middle',
+ # 'left_ring_1',
+ 'left_ring',
+ # 'left_pinky_1',
+ 'left_pinky',
+]
+SMPLX_RHAND = [
+ # 'right_thumb_2',
+ 'right_wrist',
+ 'right_thumb',
+ # 'right_index_1',
+ 'right_index',
+ # 'right_middle_1',
+ 'right_middle',
+ # 'right_ring_1',
+ 'right_ring',
+ # 'right_pinky_1',
+ 'right_pinky',
+]
+
+SMPLX_FACE = [
+ 'nose',
+ 'mouth_top',
+ 'jaw',
+ 'right_contour_1',
+ 'contour_middle',
+ 'left_contour_1'
+]
+
+AiOS_35_KEYPOINTS = [
+ 'nose',
+ 'left_eye',
+ 'right_eye',
+ 'left_ear',
+ 'right_ear',
+ 'left_shoulder',
+ 'right_shoulder',
+ 'left_elbow',
+ 'right_elbow',
+ 'left_wrist',
+ 'right_wrist',
+ 'left_hip_extra',
+ 'right_hip_extra',
+ 'left_knee',
+ 'right_knee',
+ 'left_ankle',
+ 'right_ankle',
+ 'left_wrist',
+ 'left_thumb',
+ # 'left_index_1',
+ 'left_index',
+ # 'left_middle_1',
+ 'left_middle',
+ # 'left_ring_1',
+ 'left_ring',
+ # 'left_pinky_1',
+ 'left_pinky',
+
+ 'right_wrist',
+ 'right_thumb',
+ # 'right_index_1',
+ 'right_index',
+ # 'right_middle_1',
+ 'right_middle',
+ # 'right_ring_1',
+ 'right_ring',
+ # 'right_pinky_1',
+ 'right_pinky',
+
+ 'nose',
+ 'mouth_top',
+ 'jaw',
+ 'right_contour_1',
+ 'contour_middle',
+ 'left_contour_1'
+
+]
\ No newline at end of file
diff --git a/detrsmpl/core/conventions/keypoints_mapping/spin_smplx.py b/detrsmpl/core/conventions/keypoints_mapping/spin_smplx.py
new file mode 100644
index 0000000000000000000000000000000000000000..0be4d124804a51b694b4f67805b8bb666b99b04b
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/spin_smplx.py
@@ -0,0 +1,35 @@
+"""SPIN in smplx convention.
+
+SPIN_SMPLX_KEYPOINTS can be found in https://github.com/vchoutas/expose
+"""
+
+# TODO: SMPL_24->HumanData->SMPLX causes hip, spine to be lost.
+# SMPL_24: left/right_hip_extra
+# SMPLX: left/right_hip
+
+SPIN_SMPLX_KEYPOINTS = [
+ 'right_ankle',
+ 'right_knee',
+ 'right_hip',
+ 'left_hip',
+ 'left_knee',
+ 'left_ankle',
+ 'right_wrist',
+ 'right_elbow',
+ 'right_shoulder',
+ 'left_shoulder',
+ 'left_elbow',
+ 'left_wrist',
+ 'neck',
+ 'head_top',
+ 'pelvis',
+ 'thorax',
+ 'spine',
+ 'h36m_jaw',
+ 'h36m_head',
+ 'nose',
+ 'left_eye',
+ 'right_eye',
+ 'left_ear',
+ 'right_ear',
+]
diff --git a/detrsmpl/core/conventions/keypoints_mapping/star.py b/detrsmpl/core/conventions/keypoints_mapping/star.py
new file mode 100644
index 0000000000000000000000000000000000000000..d774d8efc722f397bb39d036c70bf3f0332dedd5
--- /dev/null
+++ b/detrsmpl/core/conventions/keypoints_mapping/star.py
@@ -0,0 +1,26 @@
+STAR_KEYPOINTS = [
+ 'pelvis',
+ 'left_hip',
+ 'right_hip',
+ 'spine_1',
+ 'left_knee',
+ 'right_knee',
+ 'spine_2',
+ 'left_ankle',
+ 'right_ankle',
+ 'spine_3',
+ 'left_foot',
+ 'right_foot',
+ 'neck',
+ 'left_collar',
+ 'right_collar',
+ 'head',
+ 'left_shoulder',
+ 'right_shoulder',
+ 'left_elbow',
+ 'right_elbow',
+ 'left_wrist',
+ 'right_wrist',
+ 'left_hand',
+ 'right_hand',
+]
diff --git a/detrsmpl/core/conventions/segmentation/__init__.py b/detrsmpl/core/conventions/segmentation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e7a843ccb92d6113c415f0cf32e95cca15f075e4
--- /dev/null
+++ b/detrsmpl/core/conventions/segmentation/__init__.py
@@ -0,0 +1,94 @@
+from .smpl import SMPL_SEGMENTATION_DICT, SMPL_SUPER_SET
+from .smplx import SMPLX_SEGMENTATION_DICT, SMPLX_SUPER_SET
+
+
+class body_segmentation(object):
+ """SMPL(X) body mesh vertex segmentation."""
+ def __init__(self, model_type='smpl') -> None:
+ if model_type == 'smpl':
+ self.DICT = SMPL_SEGMENTATION_DICT
+ self.super_set = SMPL_SUPER_SET
+ self.NUM_VERTS = 6890
+ elif model_type == 'smplx':
+ self.DICT = SMPLX_SEGMENTATION_DICT
+ self.super_set = SMPLX_SUPER_SET
+ self.NUM_VERTS = 10475
+ else:
+ raise ValueError(f'Wrong model_type: {model_type}.'
+ f' Should be in {["smpl", "smplx"]}')
+ self.model_type = model_type
+ self.len = len(list(self.DICT))
+
+ def items(self, ):
+ return zip(self.keys(), [self.__getitem__(key) for key in self.keys()])
+
+ def keys(self, ):
+ return self.DICT.keys()
+
+ def values(self, ):
+ return [self.__getitem__(key) for key in self.keys()]
+
+ def __len__(self, ):
+ return self.len
+
+ def __getitem__(self, key):
+ if key in self.DICT.keys():
+ part_segmentation = []
+ raw_segmentation = self.DICT[key]
+ for continuous in raw_segmentation:
+ if len(continuous) == 2:
+ part_segmentation.extend(
+ list(range(continuous[0], continuous[1] + 1)))
+ elif len(continuous) == 1:
+ part_segmentation.extend(continuous)
+ return part_segmentation
+ elif key in self.super_set.keys():
+ super_part_segmentation = []
+ for body_part_key in self.super_set[key]:
+ super_part_segmentation += self.__getitem__(body_part_key)
+ return super_part_segmentation
+ elif key.lower() == 'all':
+ return list(range(self.NUM_VERTS))
+ else:
+ raise KeyError(f'{key} not in {self.model_type} conventions.')
+
+
+def _preprocess_segmentation_dict(segmentation_dict):
+ """help to preprocess the indexes to list."""
+ final_dict = {}
+ for k in segmentation_dict:
+ final_dict[k] = [[]]
+ final_part_indexes = final_dict[k]
+ part_indexes = segmentation_dict[k]
+ part_indexes.sort()
+ for index in range(len(part_indexes)):
+ if len(final_part_indexes[-1]) == 0:
+ final_part_indexes[-1].append(part_indexes[index])
+ elif len(final_part_indexes[-1]) == 2:
+ final_part_indexes.append([part_indexes[index]])
+ elif len(final_part_indexes[-1]) == 1:
+ if index != len(part_indexes) - 1:
+ this_index = part_indexes[index]
+ last_index = part_indexes[index - 1]
+ next_index = part_indexes[index + 1]
+ if (this_index == last_index + 1) and (this_index
+ == next_index - 1):
+ pass
+ elif (this_index == last_index +
+ 1) and (this_index != next_index - 1):
+ final_part_indexes[-1].append(this_index)
+ elif (this_index != last_index + 1) and (this_index !=
+ next_index - 1):
+ final_part_indexes.append([this_index])
+ final_part_indexes.append([])
+ elif (this_index !=
+ last_index + 1) and (this_index == next_index - 1):
+ final_part_indexes.append([this_index])
+ else:
+ this_index = part_indexes[index]
+ last_index = part_indexes[index - 1]
+ if (this_index == last_index + 1):
+ final_part_indexes[-1].append(this_index)
+ else:
+ final_part_indexes.append([this_index])
+ return final_dict
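+
+
+# Not part of the original file: a minimal usage sketch. `body_segmentation`
+# expands the compressed [start, end] ranges stored in the per-part dicts
+# into flat vertex index lists, while `_preprocess_segmentation_dict` goes
+# in the other direction (flat indices -> ranges).
+def _demo_usage():
+    """Illustrative only: expand a few parts and show the range helper."""
+    for model_type in ('smpl', 'smplx'):
+        seg = body_segmentation(model_type)
+        # 'all' returns every vertex index; 'rightHand' is one named part.
+        print(model_type, len(seg['all']), len(seg['rightHand']))
+    # Consecutive indices collapse to [start, end] pairs, isolated indices
+    # stay as one-element lists.
+    assert _preprocess_segmentation_dict({'part': [1, 2, 3, 7]}) == \
+        {'part': [[1, 3], [7]]}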
diff --git a/detrsmpl/core/conventions/segmentation/smpl.py b/detrsmpl/core/conventions/segmentation/smpl.py
new file mode 100644
index 0000000000000000000000000000000000000000..b841d74c0cdf08479c97ae7e5d6968af00aa2a94
--- /dev/null
+++ b/detrsmpl/core/conventions/segmentation/smpl.py
@@ -0,0 +1,239 @@
+"""Raw index information can be found from smpl-wiki website:
+
+https://meshcapade.wiki/SMPL#mesh-templates--samples
+"""
+SMPL_SEGMENTATION_DICT = {
+ 'rightHand':
+ [[5442, 5487], [5492, 5497], [5502, 5527], [5530, 5562], [5569], [5571],
+ [5574, 5583], [5588, 5589], [5592, 5605], [5610, 5614], [5621, 5622],
+ [5625], [5631, 5641], [5643, 5646], [5649, 5650], [5652, 5664], [5667],
+ [5670, 5675], [5682, 5690], [5692], [5695], [5697, 5701], [5707, 5721],
+ [5723, 5732], [5735, 5740], [5745, 5746], [5748, 5752], [6056, 6057],
+ [6066, 6067], [6158, 6239]],
+ 'rightUpLeg': [[4320, 4321], [4323, 4324], [4333, 4340], [4356, 4367],
+ [4383, 4401], [4419, 4422], [4430, 4532], [4623, 4634],
+ [4645, 4660], [4670, 4673], [4704, 4713], [4745, 4746],
+ [4757, 4760], [4801, 4802], [4829], [4834, 4841],
+ [4924, 4926], [4928, 4936], [4948, 4952], [4970, 4973],
+ [4983, 4993], [5004, 5005], [6546, 6549], [6552, 6556],
+ [6873], [6877]],
+ 'leftArm': [[626, 629], [634, 635], [680, 681], [716, 719], [769, 780],
+ [784, 793], [1231, 1234], [1258, 1261], [1271], [1281, 1282],
+ [1310, 1311], [1314, 1315], [1340, 1343], [1355, 1358],
+ [1376, 1400], [1402, 1403], [1405, 1416], [1428, 1433],
+ [1438, 1445], [1502], [1505, 1510], [1538],
+ [1541, 1543], [1545], [1619, 1622], [1631, 1642], [1645, 1656],
+ [1658, 1659], [1661, 1662], [1664], [1666, 1684], [1696, 1698],
+ [1703, 1720], [1725], [1731, 1735], [1737], [1739, 1740],
+ [1745, 1749], [1751], [1761], [1830, 1831], [1844, 1846],
+ [1850, 1851], [1854, 1855], [1858], [1860], [1865, 1867],
+ [1869, 1871], [1874, 1878], [1882, 1883], [1888, 1889], [1892],
+ [1900, 1904], [1909], [2819, 2822], [2895, 2903], [2945, 2946],
+ [2974, 2996], [3002], [3013]],
+ 'leftLeg': [[995], [998, 999], [1002], [1004, 1005], [1008], [1010],
+ [1012], [1015, 1016], [1018, 1019], [1043, 1044], [1047, 1136],
+ [1148, 1158], [1175, 1183], [1369, 1375], [1464, 1474],
+ [1522, 1532], [3174, 3210], [3319, 3335], [3432, 3436], [3469],
+ [3472, 3474]],
+ 'leftToeBase': [[3211, 3318], [3336, 3337], [3340], [3342], [3344], [3346],
+ [3348], [3350], [3352], [3354], [3357, 3358], [3360],
+ [3362]],
+ 'leftFoot': [[3327, 3469]],
+ 'spine1':
+ [[598, 601], [610, 621], [642], [645, 647], [652, 653], [658, 661],
+ [668, 671], [684, 692], [722, 725], [736], [750, 751], [761], [764],
+ [766, 767], [794, 795], [891, 894], [925, 929], [940, 943], [1190, 1197],
+ [1200, 1202], [1212], [1236], [1252, 1255], [1268, 1270], [1329, 1330],
+ [1348, 1349], [1351], [1420, 1421], [1423, 1426], [1436, 1437],
+ [1756, 1758], [2839, 2851], [2870, 2871], [2883], [2906], [2908], [3014],
+ [3017], [3025], [3030], [3033, 3034], [3037], [3039, 3044], [3076, 3077],
+ [3079], [3480], [3505], [3511], [4086, 4089], [4098, 4109], [4130, 4131],
+ [4134, 4135], [4140, 4141], [4146, 4149], [4156, 4159], [4172, 4180],
+ [4210, 4213], [4225], [4239, 4240], [4249, 4250], [4255, 4256],
+ [4282, 4283], [4377, 4380], [4411, 4415], [4426, 4429], [4676, 4683],
+ [4686, 4688], [4695], [4719], [4735, 4737], [4740], [4751, 4753],
+ [4824, 4825], [4828], [4893, 4895], [4897, 4899], [4908, 4909],
+ [5223, 5225], [6300, 6312], [6331, 6332], [6342], [6366, 6367], [6475],
+ [6477, 6478], [6481, 6482], [6485], [6487, 6491], [6878]],
+ 'spine2': [[570, 573], [584, 597], [602, 609], [622, 625], [638, 641],
+ [643, 644], [648, 651], [666, 667], [672, 675], [680, 683],
+ [693, 704], [713, 717], [726, 733], [735],
+ [737, 749], [752, 760], [762, 763], [803, 806], [811, 814],
+ [817, 821], [824, 828], [895, 896], [930, 931], [1198, 1199],
+ [1213, 1220], [1235], [1237], [1256, 1257], [1271, 1273],
+ [1279, 1280], [1283, 1309], [1312, 1313], [1319, 1320],
+ [1346, 1347], [1350], [1352], [1401], [1417, 1419], [1422],
+ [1427], [1434, 1435], [1503, 1504], [1536, 1537], [1544, 1545],
+ [1753, 1755], [1759, 1763], [1808, 1811], [1816, 1820],
+ [1834, 1839], [1868], [1879, 1880], [2812, 2813], [2852, 2869],
+ [2872], [2875, 2878], [2881, 2882], [2884, 2886], [2904, 2905],
+ [2907], [2931, 2937], [2941], [2950, 2973], [2997, 2998],
+ [3006, 3007], [3012], [3015], [3026, 3029], [3031, 3032],
+ [3035, 3036], [3038], [3059, 3067], [3073, 3075], [3078],
+ [3168, 3169], [3171], [3470, 3471], [3482, 3483], [3495, 3498],
+ [3506], [3508], [4058, 4061], [4072, 4085], [4090, 4097],
+ [4110, 4113], [4126, 4129], [4132, 4133], [4136, 4139],
+ [4154, 4155], [4160, 4163], [4168, 4171], [4181, 4192],
+ [4201, 4204], [4207], [4214, 4221], [4223, 4224], [4226, 4238],
+ [4241, 4248], [4251, 4252], [4291, 4294], [4299, 4302],
+ [4305, 4309], [4312, 4315], [4381, 4382], [4416, 4417],
+ [4684, 4685], [4696, 4703], [4718], [4720], [4738, 4739],
+ [4754, 4756], [4761, 4762], [4765, 4789], [4792, 4793],
+ [4799, 4800], [4822, 4823], [4826, 4827], [4874], [4890, 4892],
+ [4896], [4900], [4907], [4910], [4975, 4976], [5007, 5008],
+ [5013, 5014], [5222], [5226, 5230], [5269, 5272], [5277, 5281],
+ [5295, 5300], [5329], [5340, 5341], [6273, 6274], [6313, 6330],
+ [6333], [6336, 6337], [6340, 6341], [6343, 6345], [6363, 6365],
+ [6390, 6396], [6398], [6409, 6432], [6456, 6457], [6465, 6466],
+ [6476], [6479, 6480], [6483, 6484], [6486], [6496,
+ 6503], [6879]],
+ 'leftShoulder': [[591], [604, 606], [609], [634, 637], [674], [706, 713],
+ [715], [717], [730], [733, 735], [781, 783], [1238, 1245],
+ [1290, 1291], [1294], [1316, 1318], [1401, 1404], [1509],
+ [1535], [1545], [1808], [1810, 1815], [1818, 1819],
+ [1821, 1833], [1837], [1840, 1859], [1861, 1864],
+ [1872, 1873], [1880, 1881], [1884, 1887], [1890, 1891],
+ [1893, 1899], [2879, 2881], [2886, 2894], [2903],
+ [2938, 2949], [2965], [2967], [2969], [2999, 3005],
+ [3008, 3011]],
+ 'rightShoulder': [[4077], [4091, 4092], [4094, 4095], [4122, 4125], [4162],
+ [4194, 4201], [4203], [4207], [4218, 4219], [4222, 4223],
+ [4269, 4271], [4721, 4728], [4773, 4774], [4778],
+ [4796, 4798], [4874, 4877], [4982], [5006], [5014],
+ [5269], [5271, 5276], [5279], [5281, 5294], [5298],
+ [5301, 5320], [5322, 5325], [5333, 5334], [5341, 5342],
+ [5345, 5348], [5351, 5352], [5354, 5360], [6338, 6340],
+ [6345, 6353], [6362], [6397, 6408], [6424, 6425], [6428],
+ [6458, 6464], [6467, 6470]],
+ 'rightFoot': [[6727, 6869]],
+ 'head': [[0, 149], [154, 173], [176, 205], [220, 221], [225, 255],
+ [258, 283], [286, 295], [303, 304], [306, 307], [310, 332],
+ [335, 422], [427, 439], [442, 450], [454, 459], [461, 569],
+ [574, 583], [1764, 1766], [1770, 1778], [1905, 1908],
+ [2779, 2811], [2814, 2818], [3045, 3048], [3051, 3056], [3058],
+ [3069, 3072], [3161, 3163], [3165, 3167], [3485, 3494], [3499],
+ [3512, 3661], [3666, 3685], [3688, 3717], [3732, 3733],
+ [3737, 3767], [3770, 3795], [3798, 3807], [3815, 3816],
+ [3819, 3838], [3841, 3917], [3922, 3933], [3936, 3941],
+ [3945, 4057], [4062, 4071], [5231, 5233], [5235, 5243],
+ [5366, 5369], [6240, 6272], [6275, 6279], [6492, 6495],
+ [6880, 6889]],
+ 'rightArm': [[4114, 4117], [4122], [4125], [4168], [4171], [4204, 4207],
+ [4257, 4268], [4272, 4281], [4714, 4717], [4741,
+ 4744], [4756],
+ [4763, 4764], [4790, 4791], [4794, 4795], [4816, 4819],
+ [4830, 4833], [4849, 4873], [4876, 4889], [4901, 4906],
+ [4911, 4918], [4974], [4977, 4982], [5009, 5012], [5014],
+ [5088, 5091], [5100, 5111], [5114, 5125], [5128, 5131],
+ [5134, 5153], [5165, 5167],
+ [5172, 5189], [5194], [5200, 5204], [5206], [5208, 5209],
+ [5214, 5218], [5220], [5229], [5292, 5293], [5303], [5306],
+ [5309], [5311], [5314, 5315], [5318, 5319], [5321],
+ [5326, 5328], [5330, 5332], [5335, 5339], [5343, 5344],
+ [5349, 5350], [5353], [5361, 5365], [5370], [6280, 6283],
+ [6354, 6362], [6404, 6405], [6433, 6455], [6461], [6471]],
+ 'leftHandIndex1': [[2027, 2030], [2037, 2040], [2057], [2067, 2068],
+ [2123, 2130], [2132], [2145, 2146], [2152, 2154],
+ [2156, 2169], [2177, 2179], [2181], [2186, 2187],
+ [2190, 2191], [2204, 2205], [2215, 2220], [2232, 2233],
+ [2245, 2247], [2258, 2259], [2261, 2263], [2269, 2270],
+ [2272, 2274], [2276, 2277], [2280, 2283], [2291, 2594],
+ [2596, 2597], [2599, 2604], [2606, 2607], [2609, 2696]],
+ 'rightLeg': [[4481, 4482], [4485, 4486], [4491, 4493], [4495], [4498],
+ [4500, 4501], [4505, 4506], [4529], [4532, 4622],
+ [4634, 4644], [4661, 4669], [4842, 4848], [4937, 4947],
+ [4993, 5003], [6574, 6610], [6719, 6735], [6832, 6836],
+ [6869, 6872]],
+ 'rightHandIndex1': [[5488, 5491], [5498, 5501], [5518], [5528, 5529],
+ [5584, 5592], [5606, 5607], [5613], [5615, 5630],
+ [5638, 5640], [5642], [5647, 5648], [5650, 5651],
+ [5665, 5666], [5676, 5681], [5693, 5694], [5706, 5708],
+ [5719], [5721, 5724], [5730, 5731], [5733, 5735],
+ [5737, 5738], [5741, 5744], [5752, 6055], [6058, 6065],
+ [6068, 6157]],
+ 'leftForeArm': [[1546, 1618], [1620, 1621], [1623, 1630], [1643, 1644],
+ [1646, 1647], [1650, 1651], [1654, 1655], [1657, 1666],
+ [1685, 1695], [1699, 1702], [1721, 1730], [1732], [1736],
+ [1738], [1741, 1744], [1750], [1752], [1900], [1909, 1980],
+ [2019], [2059, 2060], [2073], [2089], [2098, 2112],
+ [2147, 2148], [2206, 2209], [2228], [2230], [2234, 2235],
+ [2241, 2244], [2279], [2286], [2873, 2874]],
+ 'rightForeArm': [[5015, 5087], [5090, 5099], [5112, 5113], [5116, 5117],
+ [5120, 5121], [5124, 5135], [5154, 5164], [5168, 5171],
+ [5190, 5199], [5202],
+ [5205], [5207], [5210, 5213], [5219], [5221], [5361],
+ [5370, 5441], [5480], [5520, 5521], [5534], [5550],
+ [5559, 5573], [5608, 5609], [5667, 5670], [5689], [5691],
+ [5695, 5696], [5702, 5705], [5740], [5747], [6334, 6335]],
+ 'neck': [[148], [150, 153], [172], [174, 175], [201, 202], [204, 219],
+ [222, 225], [256, 257], [284, 285], [295, 309], [333, 334],
+ [423, 426], [440, 441], [451, 453], [460, 461], [571, 572],
+ [824, 829], [1279, 1280], [1312, 1313], [1319, 1320], [1331],
+ [3049, 3050], [3057, 3059], [3068], [3164], [3661, 3665],
+ [3685, 3687], [3714, 3731], [3734, 3737], [3768, 3769],
+ [3796, 3797], [3807, 3819], [3839, 3840], [3918, 3921],
+ [3934, 3935], [3942, 3944], [3950], [4060, 4061], [4312, 4315],
+ [4761, 4762], [4792, 4793], [4799, 4800], [4807]],
+ 'rightToeBase': [[6611, 6718], [6736], [6739], [6741], [6743], [6745],
+ [6747], [6749, 6750], [6752], [6754], [6757, 6758],
+ [6760], [6762]],
+ 'spine': [[616, 617], [630, 633], [654, 657], [662, 665], [720, 721],
+ [765, 768], [796, 799], [889, 890], [916, 919], [921, 926],
+ [1188, 1189], [1211, 1212], [1248, 1251], [1264, 1267],
+ [1323, 1328], [1332, 1336], [1344, 1345], [1481, 1496], [1767],
+ [2823, 2845], [2847, 2848], [2851], [3016, 3020], [3023, 3024],
+ [3124], [3173], [3476, 3478], [3480], [3500,
+ 3502], [3504], [3509],
+ [3511], [4103, 4104], [4118, 4121], [4142, 4145], [4150, 4153],
+ [4208, 4209], [4253, 4256], [4284, 4287], [4375, 4376],
+ [4402, 4403], [4405, 4412], [4674, 4675], [4694, 4695],
+ [4731, 4734], [4747, 4750], [4803, 4806], [4808, 4812],
+ [4820, 4821], [4953, 4968], [5234], [6284, 6306], [6308, 6309],
+ [6312], [6472, 6474], [6545], [6874, 6876], [6878]],
+ 'leftUpLeg': [[833, 834], [838, 839], [847, 854], [870, 881], [897, 915],
+ [933, 936], [944, 1046], [1137, 1148], [1159, 1174],
+ [1184, 1187], [1221, 1230], [1262, 1263], [1274, 1277],
+ [1321, 1322], [1354], [1359, 1362], [1365,
+ 1368], [1451, 1453],
+ [1455, 1463], [1475], [1477, 1480], [1498, 1501],
+ [1511, 1514], [1516, 1522], [1533, 1534], [3125, 3128],
+ [3131, 3135], [3475], [3479]],
+ 'leftHand': [[1981, 2026], [2031, 2036],
+ [2041, 2066], [2069, 2101], [2107], [2111], [2113, 2122],
+ [2127], [2130, 2144], [2149, 2152], [2155], [2160],
+ [2163, 2164], [2170, 2180], [2182, 2185], [2188, 2189],
+ [2191, 2203], [2207], [2209, 2214], [2221, 2229], [2231],
+ [2234], [2236, 2240], [2246, 2260], [2262,
+ 2271], [2274, 2279],
+ [2284, 2285], [2287, 2290], [2293], [2595], [2598], [2605],
+ [2608], [2697, 2778]],
+ 'hips': [[631, 632], [654], [657], [662], [665], [676, 679], [705], [720],
+ [796], [799, 802], [807, 810], [815, 816], [822, 823], [830, 846],
+ [855, 869], [871], [878], [881, 890], [912], [915, 920], [932],
+ [937, 939], [1163], [1166], [1203, 1210], [1246, 1247],
+ [1262, 1263], [1276, 1278], [1321], [1336, 1339], [1353, 1354],
+ [1361, 1364], [1446, 1450], [1454], [1476], [1497], [1511],
+ [1513, 1515], [1533, 1534], [1539, 1540], [1768, 1769],
+ [1779, 1807], [2909, 2930], [3018, 3019], [3021, 3022],
+ [3080, 3124], [3128, 3130], [3136, 3160], [3170], [3172], [3481],
+ [3484], [3500], [3502, 3503], [3507], [3510], [4120, 4121],
+ [4142, 4143], [4150, 4151], [4164, 4167], [4193], [4208],
+ [4284, 4285], [4288, 4290], [4295, 4298], [4303, 4304],
+ [4310, 4311], [4316, 4332], [4341, 4356], [4364, 4365],
+ [4368, 4376], [4398, 4399], [4402, 4406], [4418], [4423, 4425],
+ [4649, 4650], [4689, 4693], [4729, 4730], [4745, 4746],
+ [4759, 4760], [4801], [4812, 4815], [4829], [4836, 4837],
+ [4919, 4923], [4927], [4969], [4983, 4984], [4986], [5004, 5005],
+ [5244, 5268], [6368, 6389], [6473, 6474], [6504, 6545],
+ [6549, 6551], [6557, 6573]]
+}
+
+SMPL_SUPER_SET = {
+ 'FOOT': ['leftFoot', 'leftToeBase', 'rightFoot', 'rightToeBase'],
+ 'HAND': ['leftHand', 'rightHand', 'leftHandIndex1', 'rightHandIndex1'],
+ 'LEG': ['rightUpLeg', 'leftUpLeg', 'leftLeg', 'rightLeg'],
+ 'ARM': ['leftForeArm', 'rightForeArm', 'leftArm', 'rightArm'],
+ 'HEAD': ['neck', 'head'],
+ 'UPBODY': ['spine1', 'spine2', 'leftShoulder', 'rightShoulder'],
+ 'DOWNBODY': ['spine', 'hips']
+}
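+
+
+# Illustrative helper (not part of the original file): entries in the
+# segmentation dict above are either singletons ``[i]`` or two-element pairs,
+# assumed here to denote inclusive ``[start, end]`` vertex-index ranges. A
+# minimal sketch of expanding one body part into a flat index list:
+def _expand_part_indices(segmentation, part_name):
+    """Expand singleton/range entries of one body part into vertex indices."""
+    indices = []
+    for entry in segmentation[part_name]:
+        if len(entry) == 1:
+            indices.append(entry[0])
+        else:
+            start, end = entry
+            indices.extend(range(start, end + 1))
+    return indices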
diff --git a/detrsmpl/core/conventions/segmentation/smplx.py b/detrsmpl/core/conventions/segmentation/smplx.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9d0927341ffc133385ab1d11ff1c37c9fbc4be9
--- /dev/null
+++ b/detrsmpl/core/conventions/segmentation/smplx.py
@@ -0,0 +1,269 @@
+"""Raw index information can be found from smpl-wiki website:
+
+https://meshcapade.wiki/SMPL#mesh-templates--samples
+"""
+SMPLX_SEGMENTATION_DICT = {
+ 'rightHand':
+ [[7331, 7376], [7381, 7386], [7391, 7416], [7419, 7451], [7456], [7459],
+ [7463, 7472], [7479, 7494], [7499, 7501], [7504, 7505], [7512, 7514],
+ [7520, 7530], [7532, 7535], [7539, 7553], [7556], [7558], [7560, 7564],
+ [7571, 7579], [7581], [7585, 7590], [7596, 7610], [7612, 7621],
+ [7624, 7629], [7634, 7635], [7637, 7640], [7643], [7947, 7948],
+ [7957, 7958], [8047, 8128]],
+ 'rightUpLeg': [[6225, 6226], [6228, 6229], [6238, 6245], [6261, 6272],
+ [6288, 6306], [6324, 6327], [6335, 6437], [6528, 6539],
+ [6550, 6565], [6575, 6578], [6609, 6618], [6650, 6651],
+ [6662, 6665], [6706, 6707], [6734], [6739, 6746],
+ [6829, 6831], [6833, 6841], [6853, 6857], [6875, 6878],
+ [6888, 6898], [6909, 6910], [8394, 8397], [8400, 8404],
+ [8721], [8725]],
+ 'leftArm': [[3256, 3259], [3266, 3267], [3311, 3312], [3346, 3349],
+ [3401, 3412], [3416, 3425], [3868, 3871], [3898, 3901], [3912],
+ [3920, 3921], [3947, 3948], [3951, 3952], [3973, 3976],
+ [3987, 3990], [4007, 4031], [4034, 4040], [4042, 4048],
+ [4060, 4064], [4067], [4072, 4079], [4135], [4138, 4143],
+ [4170, 4174], [4249, 4252], [4261, 4272], [4275, 4278],
+ [4281, 4290], [4295, 4296], [4301, 4319], [4322], [4334, 4336],
+ [4341, 4358], [4363], [4369, 4373], [4375], [4377, 4378],
+ [4383, 4387], [4389], [4398], [4449, 4450], [4460],
+ [4464, 4465], [4470, 4471], [4474, 4476], [4478], [4483, 4485],
+ [4487, 4489], [4492, 4496], [4500, 4501], [4506, 4507], [4510],
+ [4518, 4523], [5397, 5400], [5471, 5479], [5542, 5543],
+ [5572, 5573], [5576, 5595], [5597], [5607], [5628]],
+ 'head': [[0, 11], [16, 218], [223, 371], [376, 461], [464,
+ 495], [498, 551],
+ [554, 557], [560, 562], [565, 648], [651, 735], [738, 1209],
+ [1214, 1325], [1327, 1358], [1361, 1385], [1387, 1725],
+ [1728, 1758], [1760, 1789], [1791, 1885], [1887, 1897],
+ [1899, 1930], [1935, 1939], [1942, 1947], [1950, 2035],
+ [2037, 2148], [2152, 2217], [2220, 2483], [2485, 2530],
+ [2532, 2869], [2871, 2892], [2894, 2963], [2965, 2975],
+ [2977, 3011], [3014, 3183], [8731, 8810], [8815, 8838],
+ [8926, 8928], [8931, 8933], [8939], [8941, 8987], [8989, 9019],
+ [9028, 9160], [9162, 9164], [9166, 9382]],
+ 'leftEye': [[9383, 9928]],
+ 'rightEye': [[9929, 10474]],
+ 'leftLeg': [[3625, 3626], [3629, 3630], [3635, 3637], [3639], [3642, 3644],
+ [3649, 3650], [3675, 3733], [3737, 3769], [3781, 3791],
+ [3809, 3817], [3999, 4001], [4003, 4006], [4098, 4108],
+ [4154, 4164], [5728, 5764], [5873, 5889], [8892, 8896],
+ [8935, 8937], [9020]],
+ 'leftToeBase': [[5765, 5872], [5890], [5893], [5895], [5897], [5899],
+ [5901], [5903, 5904], [5906], [5908], [5911, 5912], [5914],
+ [5916]],
+ 'leftFoot': [[5881, 5919], [5922, 5930], [5933], [8728, 8730],
+ [8839, 8925], [8929, 8930], [8934, 8935]],
+ 'spine1':
+ [[3228, 3231], [3240, 3251], [3272, 3273], [3276, 3277], [3282, 3283],
+ [3288, 3291], [3298, 3301], [3314, 3322], [3352], [3355, 3357], [3369],
+ [3383, 3384], [3393, 3394], [3399, 3400], [3426, 3427], [3521, 3524],
+ [3555, 3559], [3570, 3573], [3824, 3830], [3833], [3836, 3838], [3844],
+ [3855, 3856], [3873], [3892, 3893], [3896, 3897], [3908, 3910],
+ [3981, 3982], [3985], [4052, 4054], [4056, 4058], [4069, 4070],
+ [4392, 4394], [5417, 5429], [5448, 5449], [5459], [5483], [5485, 5486],
+ [5489], [5531, 5532], [5534], [5632], [5634, 5635], [5638, 5639], [5642],
+ [5644, 5648], [5944], [5950], [5991, 5994], [6003, 6014], [6035, 6036],
+ [6039, 6040], [6045, 6046], [6051, 6054], [6061, 6064], [6077, 6085],
+ [6115, 6118], [6130], [6144, 6145], [6154, 6155], [6160, 6161],
+ [6187, 6188], [6282, 6285], [6316, 6320], [6331, 6334], [6581, 6588],
+ [6591, 6593], [6599], [6624], [6640, 6641], [6644, 6645], [6656, 6658],
+ [6729, 6730], [6733], [6798, 6800], [6802, 6804], [6813, 6814],
+ [7128, 7130], [8151, 8163], [8182, 8183], [8193], [8217, 8218], [8326],
+ [8328, 8329], [8332, 8333], [8336], [8338, 8342], [8726], [9026]],
+ 'spine2': [[3210, 3211], [3214, 3227], [3232, 3239], [3252, 3255],
+ [3268, 3271], [3274, 3275], [3278, 3281], [3296, 3297],
+ [3302, 3305], [3310, 3313], [3323, 3334], [3342, 3343],
+ [3345, 3347], [3358, 3365], [3367, 3368], [3370, 3382],
+ [3385, 3392], [3395, 3396], [3435, 3438], [3443, 3446],
+ [3449, 3453], [3525, 3526], [3560, 3561], [3831, 3832],
+ [3834, 3835], [3846, 3850], [3853, 3854], [3857], [3872],
+ [3874], [3894, 3895], [3911, 3913], [3922, 3946], [3979, 3980],
+ [3983, 3984], [4032], [4049, 4051], [4055], [4059], [4068],
+ [4071], [4136, 4137], [4168, 4169], [4174, 4175], [4279, 4280],
+ [4391], [4395, 4399], [4426, 4429], [4434, 4438], [4452, 4457],
+ [4486], [4497, 4498], [5349, 5350], [5395, 5396], [5430, 5447],
+ [5450], [5453, 5454], [5457, 5458], [5460, 5462], [5480, 5482],
+ [5484], [5487], [5499, 5501], [5519], [5521,
+ 5526], [5528, 5530],
+ [5533], [5536], [5547, 5556], [5558, 5571], [5598, 5599],
+ [5611, 5612], [5618, 5619], [5621], [5633], [5636, 5637],
+ [5640, 5641], [5643], [5650, 5657], [5920, 5921], [5932],
+ [5935, 5938], [5945], [5947], [5973, 5974], [5977, 5990],
+ [5995, 6002], [6015, 6018], [6031, 6034], [6037, 6038],
+ [6041, 6044], [6059, 6060], [6065, 6068], [6073, 6076],
+ [6086, 6097], [6105, 6106], [6108, 6110], [6119, 6126],
+ [6128, 6129], [6131, 6143], [6146, 6153], [6156, 6157],
+ [6196, 6199], [6204, 6207], [6210, 6214], [6286, 6287],
+ [6321, 6322], [6589, 6590], [6601, 6608], [6623], [6625],
+ [6642, 6643], [6659, 6661], [6670, 6694], [6727, 6728],
+ [6731, 6732], [6779], [6795, 6797], [6801], [6805], [6812],
+ [6815], [6880, 6881], [6912, 6913], [6918, 6919], [7127],
+ [7131, 7135], [7162, 7165], [7170, 7174], [7188, 7193], [7222],
+ [7233, 7234], [8129, 8130], [8164, 8181], [8184], [8187, 8188],
+ [8191, 8192], [8194, 8196], [8214, 8216], [8241, 8247], [8249],
+ [8260, 8283], [8307, 8308], [8316, 8317], [8327], [8330, 8331],
+ [8334, 8335], [8337], [8344, 8351], [8727], [9027]],
+ 'leftShoulder': [[3219], [3233, 3234], [3236, 3237], [3264, 3267], [3303],
+ [3336, 3341], [3343, 3346], [3362, 3363], [3366, 3367],
+ [3413, 3415], [3875, 3878], [3880, 3883], [3929, 3930],
+ [3935], [3953, 3955], [4032, 4035], [4143], [4167],
+ [4174], [4426, 4428], [4430, 4433], [4436], [4438, 4451],
+ [4455], [4458, 4477], [4479, 4482], [4490, 4491],
+ [4498, 4499], [4502, 4505], [4508, 4509], [4511, 4517],
+ [5455, 5457], [5462, 5470], [5479], [5535, 5546],
+ [5563, 5564], [5566], [5602], [5605, 5610], [5624, 5627]],
+ 'rightShoulder': [[5982], [5996, 5997], [5999, 6000], [6027, 6030], [6066],
+ [6099, 6104], [6106, 6109], [6123, 6124], [6127, 6128],
+ [6174, 6176], [6626, 6633], [6677, 6678], [6683],
+ [6701, 6703], [6779, 6782], [6887], [6911], [6918],
+ [7162, 7164], [7166, 7169], [7172], [7174, 7187], [7191],
+ [7194, 7213], [7215, 7218], [7226, 7227], [7234, 7235],
+ [7238, 7241], [7244, 7245], [7247, 7253], [8189, 8191],
+ [8196, 8204], [8213], [8248, 8259], [8275, 8276], [8278],
+ [8309, 8315], [8318, 8321]],
+ 'rightFoot': [[8575, 8717]],
+ 'rightArm':
+ [[6019, 6022], [6029, 6030], [6074, 6075], [6109, 6112], [6162, 6173],
+ [6177, 6186], [6619, 6622], [6646, 6649], [6660], [6668, 6669],
+ [6695, 6696], [6699, 6700], [6721, 6724], [6735, 6738], [6754, 6778],
+ [6781, 6794], [6806, 6811], [6816, 6823], [6879], [6882, 6887],
+ [6914, 6918], [6993, 6996], [7005, 7016], [7019, 7032], [7035, 7036],
+ [7039, 7058], [7070, 7072], [7077, 7094], [7099], [7105, 7109], [7111],
+ [7113, 7114], [7119, 7123], [7125], [7134], [7185, 7186], [7196],
+ [7200, 7201], [7206, 7207], [7210, 7212], [7214], [7219, 7221],
+ [7223, 7225], [7228, 7232], [7236, 7237], [7242, 7243], [7246],
+ [7254, 7259], [8131, 8134], [8205, 8213], [8255, 8256], [8284, 8306],
+ [8312], [8322]],
+ 'leftHandIndex1': [[4641, 4644], [4651, 4654], [4669], [4681, 4682],
+ [4737, 4745], [4759, 4760], [4766, 4768], [4770, 4783],
+ [4791, 4793], [4795], [4800, 4802], [4805],
+ [4818, 4819], [4829, 4834], [4846, 4847], [4859, 4861],
+ [4872], [4874, 4877], [4883, 4884], [4886, 4888],
+ [4890, 4891], [4894, 4897], [4905, 5210], [5213, 5220],
+ [5223, 5310]],
+ 'rightLeg': [[6386, 6387], [6390, 6391], [6396, 6398], [6400],
+ [6403, 6405], [6410, 6411], [6436, 6527], [6539, 6549],
+ [6566, 6574], [6747, 6753], [6842, 6852], [6898, 6908],
+ [8422, 8458], [8567, 8583], [8680, 8684], [8717, 8720]],
+ 'rightHandIndex1': [[7377, 7380], [7387, 7390], [7405], [7417, 7418],
+ [7473, 7481], [7495, 7496], [7502, 7504], [7506, 7519],
+ [7527, 7529], [7531], [7536, 7538], [7541],
+ [7554, 7555], [7565, 7570], [7582, 7583], [7595, 7597],
+ [7608], [7610, 7613], [7619, 7620], [7622, 7624],
+ [7626, 7627], [7630, 7633], [7641, 7946], [7949, 7956],
+ [7959, 8046]],
+ 'leftForeArm': [[4176, 4248], [4251, 4260], [4273, 4274], [4277, 4278],
+ [4283, 4284], [4287, 4290], [4293, 4296], [4299, 4302],
+ [4323, 4333], [4337, 4340], [4359, 4368], [4371], [4374],
+ [4376], [4379, 4382], [4388], [4390], [4518], [4523, 4594],
+ [4632], [4673, 4674], [4686], [4703], [4712, 4726],
+ [4761, 4762], [4820, 4823], [4842], [4844], [4848, 4849],
+ [4855, 4858], [4893], [4900], [5451, 5452]],
+ 'rightForeArm': [[6920, 6992], [6995, 7004], [7017, 7018], [7021, 7022],
+ [7025, 7026], [7029, 7040], [7059, 7069], [7073, 7076],
+ [7095, 7104], [7107],
+ [7110], [7112], [7115, 7118], [7124], [7126], [7254],
+ [7259, 7330], [7368], [7409, 7410], [7422], [7439],
+ [7448, 7462], [7497, 7498], [7556, 7559], [7578], [7580],
+ [7584, 7585], [7591, 7594], [7629], [7636], [8185, 8186]],
+ 'neck': [[12, 15], [219, 222], [372, 375], [462, 463], [496, 497],
+ [552, 553], [558, 559], [563, 564], [649, 650], [736, 737],
+ [1210, 1213], [1326], [1359, 1360], [1386], [1726, 1727], [1759],
+ [1790], [1886], [1898], [1931, 1934], [1940, 1941], [1948, 1949],
+ [2036], [2149, 2151], [2218, 2219], [2484], [2531], [2870],
+ [2893], [2964], [2976], [3012, 3013], [3184, 3213], [3353, 3354],
+ [3435, 3436], [3445, 3446], [3450], [3452, 3453], [3456, 3459],
+ [3857], [3918, 3919], [3944, 3945], [3949, 3950], [3956, 3957],
+ [3964], [5518, 5519], [5527], [5616, 5617], [5649], [5920],
+ [5951, 5976], [6196, 6197], [6206, 6207], [6211], [6213, 6214],
+ [6217, 6220], [6608], [6666, 6667], [6692, 6693], [6697, 6698],
+ [6704, 6705], [6712], [8343], [8938], [8940], [8988]],
+ 'rightToeBase': [[8459, 8566], [8584], [8587], [8589], [8591], [8593],
+ [8595], [8597, 8598], [8600], [8602], [8605, 8606],
+ [8608], [8610]],
+ 'spine': [[3244, 3245], [3260, 3263], [3284, 3287], [3292, 3295],
+ [3350, 3351], [3397, 3400], [3428, 3431], [3519, 3520],
+ [3546, 3547], [3549, 3556], [3822, 3823], [3844, 3845],
+ [3851, 3852], [3886, 3888], [3891], [3904, 3907], [3960, 3963],
+ [3965, 3968], [3970], [3977, 3978], [4114, 4129], [4400],
+ [5401, 5423], [5425, 5426], [5429], [5488, 5489], [5495, 5496],
+ [5623], [5629, 5631], [5699], [5939, 5941], [5943], [5948],
+ [5950], [6007, 6008], [6023, 6026], [6047, 6050], [6055, 6058],
+ [6113, 6114], [6158, 6161], [6189, 6192], [6280, 6281],
+ [6307, 6308], [6310, 6317], [6579, 6580], [6599, 6600],
+ [6636, 6639], [6652, 6655], [6708, 6711], [6713, 6716], [6718],
+ [6725, 6726], [6858, 6873], [7136], [8135, 8157], [8159, 8160],
+ [8163], [8323, 8325], [8393], [8722, 8724], [8726], [9022, 9024],
+ [9026]],
+ 'leftUpLeg': [[3464, 3465], [3467, 3468], [3477, 3484], [3500, 3511],
+ [3527, 3545], [3563, 3566], [3574, 3676], [3770, 3781],
+ [3792, 3803], [3805, 3808], [3818, 3821], [3858, 3867],
+ [3902, 3903], [3914, 3917], [3958, 3959], [3986],
+ [3991, 3998], [4085, 4087], [4089, 4097], [4109, 4113],
+ [4131, 4134], [4144, 4154], [4165, 4166], [5700, 5703],
+ [5706, 5710], [9021], [9025]],
+ 'eyeballs': [[9383, 9516], [9518, 9529], [9531, 9542], [9544, 9555],
+ [9557, 9568], [9570, 9581], [9583, 9594], [9596, 9607],
+ [9609, 9620], [9622, 9633], [9635, 9646], [9648, 9659],
+ [9661, 9672], [9674, 9685], [9687, 9698], [9700, 9711],
+ [9713, 9724], [9726, 9737], [9739, 9750], [9752, 9763],
+ [9765, 9776], [9778, 9789], [9791, 9803], [9805, 9816],
+ [9818, 9829], [9831, 9842], [9844, 9855], [9857, 9868],
+ [9870, 9881], [9883, 9894], [9896, 9907], [9909, 9920],
+ [9922, 10062], [10064, 10075], [10077, 10088], [10090, 10101],
+ [10103, 10114], [10116, 10127], [10129,
+ 10140], [10142, 10153],
+ [10155, 10166], [10168, 10179], [10181,
+ 10192], [10194, 10205],
+ [10207, 10218], [10220, 10231], [10233,
+ 10244], [10246, 10257],
+ [10259, 10270], [10272, 10283], [10285,
+ 10296], [10298, 10309],
+ [10311, 10322], [10324, 10335], [10337,
+ 10349], [10351, 10362],
+ [10364, 10375], [10377, 10388], [10390,
+ 10401], [10403, 10414],
+ [10416, 10427], [10429, 10440], [10442, 10453],
+ [10455, 10466], [10468, 10474]],
+ 'leftHand': [[4595, 4640],
+ [4645, 4650], [4655, 4680], [4683, 4715], [4720], [4723],
+ [4727, 4736], [4743, 4758], [4763, 4765], [4768, 4769],
+ [4776, 4778], [4784, 4794], [4796, 4799], [4803,
+ 4817], [4820],
+ [4822], [4824, 4828], [4835, 4843], [4845], [4849, 4854],
+ [4860, 4874], [4876, 4885], [4888, 4893], [4898, 4899],
+ [4901, 4904], [4907], [5211, 5212], [5221,
+ 5222], [5311, 5348],
+ [5351, 5394]],
+ 'hips': [[3262, 3263], [3284, 3285], [3292, 3293], [3306, 3309], [3335],
+ [3350], [3428, 3429], [3432, 3434], [3439, 3442], [3447, 3448],
+ [3454, 3455], [3460, 3476], [3485, 3500], [3510, 3520],
+ [3542, 3543], [3546, 3550], [3562], [3567, 3569], [3734, 3736],
+ [3798, 3799], [3804], [3839, 3843], [3879], [3884, 3885],
+ [3889, 3890], [3902, 3903], [3916, 3917], [3958], [3969, 3972],
+ [3986], [3993, 3994], [4002], [4041], [4065, 4066], [4080, 4084],
+ [4088], [4130], [4144, 4145], [4147], [4165, 4166], [4291, 4292],
+ [4297, 4298], [4320, 4321], [4401, 4425], [5490, 5494],
+ [5497, 5498], [5502, 5517], [5520], [5557], [5574, 5575], [5596],
+ [5600, 5601], [5603, 5604], [5613, 5615], [5620], [5622],
+ [5630, 5631], [5658, 5699], [5703, 5705], [5711, 5727], [5931],
+ [5934], [5939], [5941, 5942], [5946], [5949], [6025, 6026],
+ [6047, 6048], [6055, 6056], [6069, 6072], [6098], [6113],
+ [6189, 6190], [6193, 6195], [6200, 6203], [6208, 6209],
+ [6215, 6216], [6221, 6237], [6246, 6261], [6271, 6281],
+ [6303, 6304], [6307, 6311], [6323], [6328, 6330], [6556, 6557],
+ [6594, 6598], [6634, 6635], [6650, 6651], [6664, 6665], [6706],
+ [6717, 6720], [6734], [6741, 6742], [6824, 6828], [6832], [6874],
+ [6888, 6889], [6891], [6909, 6910], [7137, 7161], [8219, 8240],
+ [8324, 8325], [8352, 8393], [8397, 8399], [8405, 8421]]
+}
+
+SMPLX_SUPER_SET = {
+ 'FOOT': ['leftFoot', 'leftToeBase', 'rightFoot', 'rightToeBase'],
+ 'HAND': ['leftHand', 'rightHand', 'leftHandIndex1', 'rightHandIndex1'],
+ 'LEG': ['rightUpLeg', 'leftUpLeg', 'leftLeg', 'rightLeg'],
+ 'ARM': ['leftForeArm', 'rightForeArm', 'leftArm', 'rightArm'],
+ 'HEAD': ['neck', 'head', 'leftEye', 'rightEye', 'eyeballs'],
+ 'UPBODY': ['spine1', 'spine2', 'leftShoulder', 'rightShoulder'],
+ 'LOWBODY': ['spine', 'hips'],
+}
diff --git a/detrsmpl/core/distributed_wrapper.py b/detrsmpl/core/distributed_wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..106d6553f2b1d66d1b50ab8c0fd4f3dba0e38f45
--- /dev/null
+++ b/detrsmpl/core/distributed_wrapper.py
@@ -0,0 +1,134 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+from mmcv.parallel import MODULE_WRAPPERS, MMDistributedDataParallel
+from mmcv.parallel.scatter_gather import scatter_kwargs
+from torch.cuda._utils import _get_device_index
+
+
+@MODULE_WRAPPERS.register_module()
+class DistributedDataParallelWrapper(nn.Module):
+ """A DistributedDataParallel wrapper for models in 3D mesh estimation task.
+
+ In 3D mesh estimation task, there is a need to wrap different modules in
+ the models with separate DistributedDataParallel. Otherwise, it will cause
+ errors for GAN training.
+ More specific, the GAN model, usually has two sub-modules:
+ generator and discriminator. If we wrap both of them in one
+ standard DistributedDataParallel, it will cause errors during training,
+ because when we update the parameters of the generator (or discriminator),
+ the parameters of the discriminator (or generator) is not updated, which is
+ not allowed for DistributedDataParallel.
+ So we design this wrapper to separately wrap DistributedDataParallel
+ for generator and discriminator.
+ In this wrapper, we perform two operations:
+ 1. Wrap the modules in the models with separate MMDistributedDataParallel.
+ Note that only modules with parameters will be wrapped.
+ 2. Do scatter operation for 'forward', 'train_step' and 'val_step'.
+ Note that the arguments of this wrapper is the same as those in
+ `torch.nn.parallel.distributed.DistributedDataParallel`.
+ Args:
+ module (nn.Module): Module that needs to be wrapped.
+ device_ids (list[int | `torch.device`]): Same as that in
+ `torch.nn.parallel.distributed.DistributedDataParallel`.
+ dim (int, optional): Same as that in the official scatter function in
+ pytorch. Defaults to 0.
+ broadcast_buffers (bool): Same as that in
+ `torch.nn.parallel.distributed.DistributedDataParallel`.
+ Defaults to False.
+ find_unused_parameters (bool, optional): Same as that in
+ `torch.nn.parallel.distributed.DistributedDataParallel`.
+ Traverse the autograd graph of all tensors contained in returned
+ value of the wrapped module’s forward function. Defaults to False.
+ kwargs (dict): Other arguments used in
+ `torch.nn.parallel.distributed.DistributedDataParallel`.
+ """
+ def __init__(self,
+ module,
+ device_ids,
+ dim=0,
+ broadcast_buffers=False,
+ find_unused_parameters=False,
+ **kwargs):
+ super().__init__()
+ assert len(device_ids) == 1, (
+            'Currently, DistributedDataParallelWrapper only supports a '
+            'single CUDA device for each process. '
+ f'The length of device_ids must be 1, but got {len(device_ids)}.')
+ self.module = module
+ self.dim = dim
+ self.to_ddp(device_ids=device_ids,
+ dim=dim,
+ broadcast_buffers=broadcast_buffers,
+ find_unused_parameters=find_unused_parameters,
+ **kwargs)
+ self.output_device = _get_device_index(device_ids[0], True)
+
+ def to_ddp(self, device_ids, dim, broadcast_buffers,
+ find_unused_parameters, **kwargs):
+ """Wrap models with separate MMDistributedDataParallel.
+
+ It only wraps the modules with parameters.
+ """
+ for name, module in self.module._modules.items():
+ if next(module.parameters(), None) is None:
+ module = module.cuda()
+ elif all(not p.requires_grad for p in module.parameters()):
+ module = module.cuda()
+ else:
+ module = MMDistributedDataParallel(
+ module.cuda(),
+ device_ids=device_ids,
+ dim=dim,
+ broadcast_buffers=broadcast_buffers,
+ find_unused_parameters=find_unused_parameters,
+ **kwargs)
+ self.module._modules[name] = module
+
+ def scatter(self, inputs, kwargs, device_ids):
+ """Scatter function.
+
+ Args:
+ inputs (Tensor): Input Tensor.
+ kwargs (dict): Args for
+ ``mmcv.parallel.scatter_gather.scatter_kwargs``.
+ device_ids (int): Device id.
+ """
+ return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
+
+ def forward(self, *inputs, **kwargs):
+ """Forward function.
+
+ Args:
+ inputs (tuple): Input data.
+ kwargs (dict): Args for
+ ``mmcv.parallel.scatter_gather.scatter_kwargs``.
+ """
+ inputs, kwargs = self.scatter(inputs, kwargs,
+ [torch.cuda.current_device()])
+ return self.module(*inputs[0], **kwargs[0])
+
+ def train_step(self, *inputs, **kwargs):
+ """Train step function.
+
+ Args:
+ inputs (Tensor): Input Tensor.
+ kwargs (dict): Args for
+ ``mmcv.parallel.scatter_gather.scatter_kwargs``.
+ """
+ inputs, kwargs = self.scatter(inputs, kwargs,
+ [torch.cuda.current_device()])
+ output = self.module.train_step(*inputs[0], **kwargs[0])
+ return output
+
+ def val_step(self, *inputs, **kwargs):
+ """Validation step function.
+
+ Args:
+ inputs (tuple): Input data.
+ kwargs (dict): Args for ``scatter_kwargs``.
+ """
+ inputs, kwargs = self.scatter(inputs, kwargs,
+ [torch.cuda.current_device()])
+ output = self.module.val_step(*inputs[0], **kwargs[0])
+ return output
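+
+
+if __name__ == '__main__':
+    # Illustrative sketch (not part of the original module): wrap a toy model
+    # whose parameterised sub-modules should be wrapped separately. This
+    # assumes a single-GPU process in which the default torch.distributed
+    # process group has already been initialised.
+    toy = nn.Module()
+    toy.generator = nn.Linear(8, 8)
+    toy.discriminator = nn.Linear(8, 1)
+    wrapped = DistributedDataParallelWrapper(toy, device_ids=[0])
+    # Each parameterised sub-module is now an MMDistributedDataParallel.
+    print(type(wrapped.module.generator).__name__)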
diff --git a/detrsmpl/core/evaluation/__init__.py b/detrsmpl/core/evaluation/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c6ee4cb3215cdb31960ccf00a7d81c69bd5df482
--- /dev/null
+++ b/detrsmpl/core/evaluation/__init__.py
@@ -0,0 +1,17 @@
+from detrsmpl.core.evaluation import mesh_eval
+from detrsmpl.core.evaluation.eval_hooks import DistEvalHook, EvalHook
+from detrsmpl.core.evaluation.eval_utils import (
+ fg_vertices_to_mesh_distance,
+ keypoint_3d_auc,
+ keypoint_3d_pck,
+ keypoint_accel_error,
+ keypoint_mpjpe,
+ vertice_pve,
+)
+from detrsmpl.core.evaluation.mesh_eval import compute_similarity_transform
+
+__all__ = [
+ 'compute_similarity_transform', 'keypoint_mpjpe', 'mesh_eval',
+ 'DistEvalHook', 'EvalHook', 'vertice_pve', 'keypoint_3d_pck',
+ 'keypoint_3d_auc', 'keypoint_accel_error', 'fg_vertices_to_mesh_distance'
+]
diff --git a/detrsmpl/core/evaluation/eval_hooks.py b/detrsmpl/core/evaluation/eval_hooks.py
new file mode 100644
index 0000000000000000000000000000000000000000..f18709963e7f534a7241cd8ac60c7b6cbfcf03b3
--- /dev/null
+++ b/detrsmpl/core/evaluation/eval_hooks.py
@@ -0,0 +1,135 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import tempfile
+import warnings
+
+from mmcv.runner import DistEvalHook as BaseDistEvalHook
+from mmcv.runner import EvalHook as BaseEvalHook
+
+MMHUMAN3D_GREATER_KEYS = ['3dpck', 'pa-3dpck', '3dauc', 'pa-3dauc']
+MMHUMAN3D_LESS_KEYS = ['mpjpe', 'pa-mpjpe', 'pve']
+
+
+class EvalHook(BaseEvalHook):
+ def __init__(self,
+ dataloader,
+ start=None,
+ interval=1,
+ by_epoch=True,
+ save_best=None,
+ rule=None,
+ test_fn=None,
+ greater_keys=MMHUMAN3D_GREATER_KEYS,
+ less_keys=MMHUMAN3D_LESS_KEYS,
+ **eval_kwargs):
+ if test_fn is None:
+ from detrsmpl.apis import single_gpu_test
+ test_fn = single_gpu_test
+
+ # remove "gpu_collect" from eval_kwargs
+ if 'gpu_collect' in eval_kwargs:
+ warnings.warn(
+ '"gpu_collect" will be deprecated in EvalHook.'
+ 'Please remove it from the config.', DeprecationWarning)
+ _ = eval_kwargs.pop('gpu_collect')
+
+ # update "save_best" according to "key_indicator" and remove the
+ # latter from eval_kwargs
+ if 'key_indicator' in eval_kwargs or isinstance(save_best, bool):
+ warnings.warn(
+ '"key_indicator" will be deprecated in EvalHook.'
+ 'Please use "save_best" to specify the metric key,'
+ 'e.g., save_best="pa-mpjpe".', DeprecationWarning)
+
+ key_indicator = eval_kwargs.pop('key_indicator', None)
+ if save_best is True and key_indicator is None:
+                raise ValueError('key_indicator should not be None when '
+ 'save_best is set to True.')
+ save_best = key_indicator
+
+ super().__init__(dataloader, start, interval, by_epoch, save_best,
+ rule, test_fn, greater_keys, less_keys, **eval_kwargs)
+
+ def evaluate(self, runner, results):
+
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ eval_res = self.dataloader.dataset.evaluate(results,
+ res_folder=tmp_dir,
+ logger=runner.logger,
+ **self.eval_kwargs)
+
+ for name, val in eval_res.items():
+ runner.log_buffer.output[name] = val
+ runner.log_buffer.ready = True
+
+ if self.save_best is not None:
+ if self.key_indicator == 'auto':
+ self._init_rule(self.rule, list(eval_res.keys())[0])
+
+ return eval_res[self.key_indicator]
+
+ return None
+
+
+class DistEvalHook(BaseDistEvalHook):
+ def __init__(self,
+ dataloader,
+ start=None,
+ interval=1,
+ by_epoch=True,
+ save_best=None,
+ rule=None,
+ test_fn=None,
+ greater_keys=MMHUMAN3D_GREATER_KEYS,
+ less_keys=MMHUMAN3D_LESS_KEYS,
+ broadcast_bn_buffer=True,
+ tmpdir=None,
+ gpu_collect=False,
+ **eval_kwargs):
+
+ if test_fn is None:
+ from detrsmpl.apis import multi_gpu_test
+ test_fn = multi_gpu_test
+
+ # update "save_best" according to "key_indicator" and remove the
+ # latter from eval_kwargs
+ if 'key_indicator' in eval_kwargs or isinstance(save_best, bool):
+ warnings.warn(
+ '"key_indicator" will be deprecated in EvalHook.'
+ 'Please use "save_best" to specify the metric key,'
+ 'e.g., save_best="pa-mpjpe".', DeprecationWarning)
+
+ key_indicator = eval_kwargs.pop('key_indicator', None)
+ if save_best is True and key_indicator is None:
+                raise ValueError('key_indicator should not be None when '
+ 'save_best is set to True.')
+ save_best = key_indicator
+
+ super().__init__(dataloader, start, interval, by_epoch, save_best,
+ rule, test_fn, greater_keys, less_keys,
+ broadcast_bn_buffer, tmpdir, gpu_collect,
+ **eval_kwargs)
+
+ def evaluate(self, runner, results):
+ """Evaluate the results.
+
+ Args:
+            runner (:obj:`mmcv.Runner`): The underlying training runner.
+ results (list): Output results.
+ """
+ with tempfile.TemporaryDirectory() as tmp_dir:
+ eval_res = self.dataloader.dataset.evaluate(results,
+ res_folder=tmp_dir,
+ logger=runner.logger,
+ **self.eval_kwargs)
+
+ for name, val in eval_res.items():
+ runner.log_buffer.output[name] = val
+ runner.log_buffer.ready = True
+
+ if self.save_best is not None:
+ if self.key_indicator == 'auto':
+ # infer from eval_results
+ self._init_rule(self.rule, list(eval_res.keys())[0])
+ return eval_res[self.key_indicator]
+
+ return None
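+
+
+# Illustrative sketch (not part of the original module): registering an
+# EvalHook that tracks the best checkpoint by PA-MPJPE. `val_dataloader` and
+# `runner` are hypothetical; the wrapped dataset is assumed to implement
+# `evaluate()`:
+#
+#   eval_hook = EvalHook(val_dataloader, interval=1, save_best='pa-mpjpe')
+#   runner.register_hook(eval_hook)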
diff --git a/detrsmpl/core/evaluation/eval_utils.py b/detrsmpl/core/evaluation/eval_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..bd0fddd9324cc504271c034d810ac4b975dfd495
--- /dev/null
+++ b/detrsmpl/core/evaluation/eval_utils.py
@@ -0,0 +1,287 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import trimesh
+from trimesh.proximity import closest_point
+
+from .mesh_eval import compute_similarity_transform
+
+
+def keypoint_mpjpe(pred, gt, mask, alignment='none'):
+ """Calculate the mean per-joint position error (MPJPE) and the error after
+ rigid alignment with the ground truth (PA-MPJPE).
+ batch_size: N
+ num_keypoints: K
+ keypoint_dims: C
+ Args:
+ pred (np.ndarray[N, K, C]): Predicted keypoint location.
+ gt (np.ndarray[N, K, C]): Groundtruth keypoint location.
+ mask (np.ndarray[N, K]): Visibility of the target. False for invisible
+ joints, and True for visible. Invisible joints will be ignored for
+ accuracy calculation.
+ alignment (str, optional): method to align the prediction with the
+ groundtruth. Supported options are:
+ - ``'none'``: no alignment will be applied
+ - ``'scale'``: align in the least-square sense in scale
+ - ``'procrustes'``: align in the least-square sense in scale,
+ rotation and translation.
+    Returns:
+        float: The mean per-joint position error over all visible joints.
+            With ``alignment='procrustes'`` this is PA-MPJPE; with
+            ``alignment='none'`` it is the plain MPJPE.
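+
+    Example:
+        A minimal, illustrative sketch with random inputs (not taken from the
+        original documentation):
+
+        >>> import numpy as np
+        >>> pred = np.random.rand(2, 17, 3)
+        >>> gt = np.random.rand(2, 17, 3)
+        >>> mask = np.ones((2, 17), dtype=bool)
+        >>> mpjpe = keypoint_mpjpe(pred, gt, mask, alignment='none')
+        >>> pa_mpjpe = keypoint_mpjpe(pred, gt, mask, alignment='procrustes')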
+ """
+ assert mask.any()
+
+ if alignment == 'none':
+ pass
+ elif alignment == 'procrustes':
+ pred = np.stack([
+ compute_similarity_transform(pred_i, gt_i)
+ for pred_i, gt_i in zip(pred, gt)
+ ])
+ elif alignment == 'scale':
+ pred_dot_pred = np.einsum('nkc,nkc->n', pred, pred)
+ pred_dot_gt = np.einsum('nkc,nkc->n', pred, gt)
+ scale_factor = pred_dot_gt / pred_dot_pred
+ pred = pred * scale_factor[:, None, None]
+ else:
+ raise ValueError(f'Invalid value for alignment: {alignment}')
+
+ error = np.linalg.norm(pred - gt, ord=2, axis=-1)[mask].mean()
+
+ return error
+
+
+def keypoint_accel_error(gt, pred, mask=None):
+ """Computes acceleration error:
+
+ Note that for each frame that is not visible, three entries in the
+ acceleration error should be zero'd out.
+ Args:
+ gt (Nx14x3).
+ pred (Nx14x3).
+ mask (N).
+ Returns:
+ error_accel (N-2).
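+
+    Example:
+        An illustrative sketch (not from the original docs): a constant
+        acceleration of 2 along x gives an error of 2 for every frame.
+
+        >>> import numpy as np
+        >>> gt = np.zeros((4, 14, 3))
+        >>> pred = np.zeros((4, 14, 3))
+        >>> pred[:, :, 0] = (np.arange(4) ** 2)[:, None]  # x(t) = t^2
+        >>> np.allclose(keypoint_accel_error(gt, pred), 2.0)
+        True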
+ """
+ # (N-2)x14x3
+ accel_gt = gt[:-2] - 2 * gt[1:-1] + gt[2:]
+ accel_pred = pred[:-2] - 2 * pred[1:-1] + pred[2:]
+
+ normed = np.linalg.norm(accel_pred - accel_gt, axis=2)
+
+ if mask is None:
+ new_vis = np.ones(len(normed), dtype=bool)
+ else:
+ invis = np.logical_not(mask)
+ invis1 = np.roll(invis, -1)
+ invis2 = np.roll(invis, -2)
+ new_invis = np.logical_or(invis, np.logical_or(invis1, invis2))[:-2]
+ new_vis = np.logical_not(new_invis)
+
+ return np.mean(normed[new_vis], axis=1)
+
+
+def vertice_pve(pred_verts, target_verts, alignment='none'):
+ """Computes per vertex error (PVE).
+
+ Args:
+ verts_gt (N x verts_num x 3).
+ verts_pred (N x verts_num x 3).
+ alignment (str, optional): method to align the prediction with the
+ groundtruth. Supported options are:
+ - ``'none'``: no alignment will be applied
+ - ``'scale'``: align in the least-square sense in scale
+ - ``'procrustes'``: align in the least-square sense in scale,
+ rotation and translation.
+ Returns:
+        error_verts (float): Mean per-vertex position error.
+ """
+ assert len(pred_verts) == len(target_verts)
+ if alignment == 'none':
+ pass
+ elif alignment == 'procrustes':
+ pred_verts = np.stack([
+ compute_similarity_transform(pred_i, gt_i)
+ for pred_i, gt_i in zip(pred_verts, target_verts)
+ ])
+ elif alignment == 'scale':
+ pred_dot_pred = np.einsum('nkc,nkc->n', pred_verts, pred_verts)
+ pred_dot_gt = np.einsum('nkc,nkc->n', pred_verts, target_verts)
+ scale_factor = pred_dot_gt / pred_dot_pred
+ pred_verts = pred_verts * scale_factor[:, None, None]
+ else:
+ raise ValueError(f'Invalid value for alignment: {alignment}')
+ error = np.linalg.norm(pred_verts - target_verts, ord=2, axis=-1).mean()
+ return error
+
+
+def keypoint_3d_pck(pred, gt, mask, alignment='none', threshold=150.):
+ """Calculate the Percentage of Correct Keypoints (3DPCK) w. or w/o rigid
+ alignment.
+ Paper ref: `Monocular 3D Human Pose Estimation In The Wild Using Improved
+ CNN Supervision' 3DV'2017. `__ .
+ Note:
+ - batch_size: N
+ - num_keypoints: K
+ - keypoint_dims: C
+ Args:
+ pred (np.ndarray[N, K, C]): Predicted keypoint location.
+ gt (np.ndarray[N, K, C]): Groundtruth keypoint location.
+ mask (np.ndarray[N, K]): Visibility of the target. False for invisible
+ joints, and True for visible. Invisible joints will be ignored for
+ accuracy calculation.
+ alignment (str, optional): method to align the prediction with the
+ groundtruth. Supported options are:
+ - ``'none'``: no alignment will be applied
+ - ``'scale'``: align in the least-square sense in scale
+ - ``'procrustes'``: align in the least-square sense in scale,
+ rotation and translation.
+        threshold: If the L2 distance between the prediction and the
+            groundtruth is less than this threshold, the predicted result is
+            considered correct. Default: 150 (mm).
+ Returns:
+ pck: percentage of correct keypoints.
+ """
+ assert mask.any()
+
+ if alignment == 'none':
+ pass
+ elif alignment == 'procrustes':
+ pred = np.stack([
+ compute_similarity_transform(pred_i, gt_i)
+ for pred_i, gt_i in zip(pred, gt)
+ ])
+ elif alignment == 'scale':
+ pred_dot_pred = np.einsum('nkc,nkc->n', pred, pred)
+ pred_dot_gt = np.einsum('nkc,nkc->n', pred, gt)
+ scale_factor = pred_dot_gt / pred_dot_pred
+ pred = pred * scale_factor[:, None, None]
+ else:
+ raise ValueError(f'Invalid value for alignment: {alignment}')
+
+ error = np.linalg.norm(pred - gt, ord=2, axis=-1)
+ pck = (error < threshold).astype(np.float32)[mask].mean() * 100
+
+ return pck
+
+
+def keypoint_3d_auc(pred, gt, mask, alignment='none'):
+ """Calculate the Area Under the Curve (3DAUC) computed for a range of 3DPCK
+ thresholds.
+    Paper ref: 'Monocular 3D Human Pose Estimation In The Wild Using Improved
+    CNN Supervision', 3DV 2017.
+ This implementation is derived from mpii_compute_3d_pck.m, which is
+ provided as part of the MPI-INF-3DHP test data release.
+ Note:
+ batch_size: N
+ num_keypoints: K
+ keypoint_dims: C
+ Args:
+ pred (np.ndarray[N, K, C]): Predicted keypoint location.
+ gt (np.ndarray[N, K, C]): Groundtruth keypoint location.
+ mask (np.ndarray[N, K]): Visibility of the target. False for invisible
+ joints, and True for visible. Invisible joints will be ignored for
+ accuracy calculation.
+ alignment (str, optional): method to align the prediction with the
+ groundtruth. Supported options are:
+ - ``'none'``: no alignment will be applied
+ - ``'scale'``: align in the least-square sense in scale
+ - ``'procrustes'``: align in the least-square sense in scale,
+ rotation and translation.
+ Returns:
+ auc: AUC computed for a range of 3DPCK thresholds.
+ """
+ assert mask.any()
+
+ if alignment == 'none':
+ pass
+ elif alignment == 'procrustes':
+ pred = np.stack([
+ compute_similarity_transform(pred_i, gt_i)
+ for pred_i, gt_i in zip(pred, gt)
+ ])
+ elif alignment == 'scale':
+ pred_dot_pred = np.einsum('nkc,nkc->n', pred, pred)
+ pred_dot_gt = np.einsum('nkc,nkc->n', pred, gt)
+ scale_factor = pred_dot_gt / pred_dot_pred
+ pred = pred * scale_factor[:, None, None]
+ else:
+ raise ValueError(f'Invalid value for alignment: {alignment}')
+
+ error = np.linalg.norm(pred - gt, ord=2, axis=-1)
+
+ thresholds = np.linspace(0., 150, 31)
+ pck_values = np.zeros(len(thresholds))
+ for i in range(len(thresholds)):
+ pck_values[i] = (error < thresholds[i]).astype(np.float32)[mask].mean()
+
+ auc = pck_values.mean() * 100
+
+ return auc
+
+
+def fg_vertices_to_mesh_distance(groundtruth_vertices,
+ grundtruth_landmark_points,
+ predicted_mesh_vertices, predicted_mesh_faces,
+ predicted_mesh_landmark_points):
+ """This script computes the reconstruction error between an input mesh and
+ a ground truth mesh.
+ Args:
+ groundtruth_vertices (np.ndarray[N,3]): Ground truth vertices.
+ grundtruth_landmark_points (np.ndarray[7,3]): Ground truth annotations.
+ predicted_mesh_vertices (np.ndarray[M,3]): Predicted vertices.
+ predicted_mesh_faces (np.ndarray[K,3]): Vertex indices
+ composing the predicted mesh.
+ predicted_mesh_landmark_points (np.ndarray[7,3]): Predicted points.
+
+ Return:
+ distance: Mean point to mesh distance.
+
+ The grundtruth_landmark_points and predicted_mesh_landmark_points have to
+ contain points in the following order:
+ (1) right eye outer corner, (2) right eye inner corner,
+ (3) left eye inner corner, (4) left eye outer corner,
+ (5) nose bottom, (6) right mouth corner, (7) left mouth corner.
+ """
+
+ # Do procrustes based on the 7 points:
+ _, tform = compute_similarity_transform(predicted_mesh_landmark_points,
+ grundtruth_landmark_points,
+ return_tform=True)
+ # Use tform to transform all vertices.
+ predicted_mesh_vertices_aligned = (
+ tform['scale'] * tform['rotation'].dot(predicted_mesh_vertices.T) +
+ tform['translation']).T
+
+ # Compute the mask: A circular area around the center of the face.
+ nose_bottom = np.array(grundtruth_landmark_points[4])
+ nose_bridge = (np.array(grundtruth_landmark_points[1]) + np.array(
+ grundtruth_landmark_points[2])) / 2 # between the inner eye corners
+ face_centre = nose_bottom + 0.3 * (nose_bridge - nose_bottom)
+ # Compute the radius for the face mask:
+ outer_eye_dist = np.linalg.norm(
+ np.array(grundtruth_landmark_points[0]) -
+ np.array(grundtruth_landmark_points[3]))
+ nose_dist = np.linalg.norm(nose_bridge - nose_bottom)
+ mask_radius = 1.2 * (outer_eye_dist + nose_dist) / 2
+
+ # Find all the vertex indices in mask area.
+ vertex_indices_mask = []
+ # vertex indices in the source mesh (the ground truth scan)
+ points_on_groundtruth_scan_to_measure_from = []
+ for vertex_idx, vertex in enumerate(groundtruth_vertices):
+ dist = np.linalg.norm(
+ vertex - face_centre
+ ) # We use Euclidean distance for the mask area for now.
+ if dist <= mask_radius:
+ vertex_indices_mask.append(vertex_idx)
+ points_on_groundtruth_scan_to_measure_from.append(vertex)
+ assert len(vertex_indices_mask) == len(
+ points_on_groundtruth_scan_to_measure_from)
+ # Calculate the distance to the surface of the predicted mesh.
+ predicted_mesh = trimesh.Trimesh(predicted_mesh_vertices_aligned,
+ predicted_mesh_faces)
+ _, distance, _ = closest_point(predicted_mesh,
+ points_on_groundtruth_scan_to_measure_from)
+ return distance.mean()
diff --git a/detrsmpl/core/evaluation/mesh_eval.py b/detrsmpl/core/evaluation/mesh_eval.py
new file mode 100644
index 0000000000000000000000000000000000000000..913052c4a3e09d380de8e643a047f78d7a11cda6
--- /dev/null
+++ b/detrsmpl/core/evaluation/mesh_eval.py
@@ -0,0 +1,77 @@
+# ------------------------------------------------------------------------------
+# Adapted from https://github.com/akanazawa/hmr
+# Original licence: Copyright (c) 2018 akanazawa, under the MIT License.
+# ------------------------------------------------------------------------------
+
+import numpy as np
+
+
+def compute_similarity_transform(source_points,
+ target_points,
+ return_tform=False):
+ """Computes a similarity transform (sR, t) that takes a set of 3D points
+ source_points (N x 3) closest to a set of 3D points target_points, where R
+ is an 3x3 rotation matrix, t 3x1 translation, s scale.
+
+ And return the
+ transformed 3D points source_points_hat (N x 3). i.e. solves the orthogonal
+ Procrutes problem.
+ Notes:
+ Points number: N
+ Args:
+ source_points (np.ndarray([N, 3])): Source point set.
+ target_points (np.ndarray([N, 3])): Target point set.
+ return_tform (bool) : Whether return transform
+ Returns:
+ source_points_hat (np.ndarray([N, 3])): Transformed source point set.
+ transform (dict): Returns if return_tform is True.
+ Returns rotation: r, 'scale': s, 'translation':t.
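+
+    Example:
+        An illustrative sketch (not part of the original docstring): an exact
+        similarity transform between two point sets is recovered exactly.
+
+        >>> import numpy as np
+        >>> src = np.random.rand(10, 3)
+        >>> theta = np.pi / 4
+        >>> R = np.array([[np.cos(theta), -np.sin(theta), 0.],
+        ...               [np.sin(theta), np.cos(theta), 0.],
+        ...               [0., 0., 1.]])
+        >>> tgt = 2.0 * src.dot(R.T) + np.array([1., 2., 3.])
+        >>> aligned = compute_similarity_transform(src, tgt)
+        >>> np.allclose(aligned, tgt)
+        True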
+ """
+
+ assert target_points.shape[0] == source_points.shape[0]
+ assert target_points.shape[1] == 3 and source_points.shape[1] == 3
+
+ source_points = source_points.T
+ target_points = target_points.T
+
+ # 1. Remove mean.
+ mu1 = source_points.mean(axis=1, keepdims=True)
+ mu2 = target_points.mean(axis=1, keepdims=True)
+ X1 = source_points - mu1
+ X2 = target_points - mu2
+
+ # 2. Compute variance of X1 used for scale.
+ var1 = np.sum(X1**2)
+
+ # 3. The outer product of X1 and X2.
+ K = X1.dot(X2.T)
+
+ # 4. Solution that Maximizes trace(R'K) is R=U*V', where U, V are
+ # singular vectors of K.
+ U, _, Vh = np.linalg.svd(K)
+ V = Vh.T
+ # Construct Z that fixes the orientation of R to get det(R)=1.
+ Z = np.eye(U.shape[0])
+ Z[-1, -1] *= np.sign(np.linalg.det(U.dot(V.T)))
+ # Construct R.
+ R = V.dot(Z.dot(U.T))
+
+ # 5. Recover scale.
+ scale = np.trace(R.dot(K)) / var1
+
+ # 6. Recover translation.
+ t = mu2 - scale * (R.dot(mu1))
+
+ # 7. Transform the source points:
+ source_points_hat = scale * R.dot(source_points) + t
+
+ source_points_hat = source_points_hat.T
+
+ if return_tform:
+ return source_points_hat, {
+ 'rotation': R,
+ 'scale': scale,
+ 'translation': t
+ }
+
+ return source_points_hat
diff --git a/detrsmpl/core/optimizer/__init__.py b/detrsmpl/core/optimizer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4340ffc075afdcdf3d9f7a398ead394ca5a168a1
--- /dev/null
+++ b/detrsmpl/core/optimizer/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .builder import OPTIMIZERS, build_optimizers
+
+__all__ = ['build_optimizers', 'OPTIMIZERS']
diff --git a/detrsmpl/core/optimizer/builder.py b/detrsmpl/core/optimizer/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..80f659453a797def86628250cfbb7638ec0f323f
--- /dev/null
+++ b/detrsmpl/core/optimizer/builder.py
@@ -0,0 +1,52 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.runner import build_optimizer
+from mmcv.utils import Registry
+
+OPTIMIZERS = Registry('optimizers')
+
+
+def build_optimizers(model, cfgs):
+ """Build multiple optimizers from configs. If `cfgs` contains several dicts
+ for optimizers, then a dict for each constructed optimizers will be
+ returned. If `cfgs` only contains one optimizer config, the constructed
+ optimizer itself will be returned. For example,
+
+ 1) Multiple optimizer configs:
+
+ .. code-block:: python
+
+ optimizer_cfg = dict(
+ model1=dict(type='SGD', lr=lr),
+ model2=dict(type='SGD', lr=lr))
+
+ The return dict is
+ ``dict('model1': torch.optim.Optimizer, 'model2': torch.optim.Optimizer)``
+
+ 2) Single optimizer config:
+
+ .. code-block:: python
+
+ optimizer_cfg = dict(type='SGD', lr=lr)
+
+ The return is ``torch.optim.Optimizer``.
+
+ Args:
+ model (:obj:`nn.Module`): The model with parameters to be optimized.
+ cfgs (dict): The config dict of the optimizer.
+
+ Returns:
+ dict[:obj:`torch.optim.Optimizer`] | :obj:`torch.optim.Optimizer`:
+ The initialized optimizers.
+ """
+ optimizers = {}
+ if hasattr(model, 'module'):
+ model = model.module
+ # determine whether 'cfgs' has several dicts for optimizers
+ if all(isinstance(v, dict) for v in cfgs.values()):
+ for key, cfg in cfgs.items():
+ cfg_ = cfg.copy()
+ module = getattr(model, key)
+ optimizers[key] = build_optimizer(module, cfg_)
+ return optimizers
+
+ return build_optimizer(model, cfgs)
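+
+
+if __name__ == '__main__':
+    # Illustrative sketch (not part of the original module): build separate
+    # SGD optimizers for two sub-modules of a toy model. The sub-module names
+    # ('backbone', 'head') are arbitrary and must match the config keys.
+    import torch.nn as nn
+    toy = nn.Module()
+    toy.backbone = nn.Linear(8, 8)
+    toy.head = nn.Linear(8, 2)
+    optims = build_optimizers(
+        toy,
+        dict(backbone=dict(type='SGD', lr=0.01),
+             head=dict(type='SGD', lr=0.1)))
+    print(sorted(optims.keys()))  # ['backbone', 'head']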
diff --git a/detrsmpl/core/post_processing/__init__.py b/detrsmpl/core/post_processing/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5b3945d2377ba675b2f20b63b3b550acccdbdd4f
--- /dev/null
+++ b/detrsmpl/core/post_processing/__init__.py
@@ -0,0 +1,15 @@
+from .builder import build_post_processing
+from .smooth.gaus1d_filter import Gaus1dFilter
+from .smooth.oneeuro_filter import OneEuroFilter
+from .smooth.savgol_filter import SGFilter
+from .smooth.smoothnet import SmoothNetFilter
+from .speed_up.deciwatch import DeciWatchPostProcessing
+
+__all__ = [
+ 'build_post_processing',
+ 'OneEuroFilter',
+ 'SGFilter',
+ 'Gaus1dFilter',
+ 'SmoothNetFilter',
+ 'DeciWatchPostProcessing',
+]
diff --git a/detrsmpl/core/post_processing/bbox/__init__.py b/detrsmpl/core/post_processing/bbox/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..59755b46c6df58c4c5dc74b5fe3d08fa10d1c49f
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/__init__.py
@@ -0,0 +1,6 @@
+from .assigners import AssignResult, BaseAssigner
+
+__all__ = [
+ 'AssignResult',
+ 'BaseAssigner',
+]
diff --git a/detrsmpl/core/post_processing/bbox/assigners/__init__.py b/detrsmpl/core/post_processing/bbox/assigners/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..5aa9ad6c937ca6a98afe04d2ac9b81c12d20b185
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/assigners/__init__.py
@@ -0,0 +1,8 @@
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+from .builder import build_assigner
+from .hungarian_assigner import HungarianAssigner
+
+__all__ = [
+ 'build_assigner', 'HungarianAssigner', 'AssignResult', 'BaseAssigner'
+]
diff --git a/detrsmpl/core/post_processing/bbox/assigners/assign_result.py b/detrsmpl/core/post_processing/bbox/assigners/assign_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..a44da61acc0c2bcb5ef0ce74f165f17d1da2764a
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/assigners/assign_result.py
@@ -0,0 +1,208 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+# from mmdet.utils import util_mixins
+from detrsmpl.utils import util_mixins
+
+
+class AssignResult(util_mixins.NiceRepr):
+ """Stores assignments between predicted and truth boxes.
+
+ Attributes:
+ num_gts (int): the number of truth boxes considered when computing this
+ assignment
+
+ gt_inds (LongTensor): for each predicted box indicates the 1-based
+ index of the assigned truth box. 0 means unassigned and -1 means
+ ignore.
+
+ max_overlaps (FloatTensor): the iou between the predicted box and its
+ assigned truth box.
+
+ labels (None | LongTensor): If specified, for each predicted box
+ indicates the category label of the assigned truth box.
+
+ Example:
+ >>> # An assign result between 4 predicted boxes and 9 true boxes
+ >>> # where only two boxes were assigned.
+ >>> num_gts = 9
+        >>> max_overlaps = torch.FloatTensor([0, .5, .9, 0])
+ >>> gt_inds = torch.LongTensor([-1, 1, 2, 0])
+ >>> labels = torch.LongTensor([0, 3, 4, 0])
+ >>> self = AssignResult(num_gts, gt_inds, max_overlaps, labels)
+ >>> print(str(self)) # xdoctest: +IGNORE_WANT
+        <AssignResult(num_gts=9, gt_inds.shape=(4,), max_overlaps.shape=(4,),
+                      labels.shape=(4,))>
+ >>> # Force addition of gt labels (when adding gt as proposals)
+ >>> new_labels = torch.LongTensor([3, 4, 5])
+ >>> self.add_gt_(new_labels)
+ >>> print(str(self)) # xdoctest: +IGNORE_WANT
+        <AssignResult(num_gts=9, gt_inds.shape=(7,), max_overlaps.shape=(7,),
+                      labels.shape=(7,))>
+ """
+ def __init__(self, num_gts, gt_inds, max_overlaps, labels=None):
+ self.num_gts = num_gts
+ self.gt_inds = gt_inds
+ self.max_overlaps = max_overlaps
+ self.labels = labels
+ # Interface for possible user-defined properties
+ self._extra_properties = {}
+
+ @property
+ def num_preds(self):
+ """int: the number of predictions in this assignment"""
+ return len(self.gt_inds)
+
+ def set_extra_property(self, key, value):
+ """Set user-defined new property."""
+ assert key not in self.info
+ self._extra_properties[key] = value
+
+ def get_extra_property(self, key):
+ """Get user-defined property."""
+ return self._extra_properties.get(key, None)
+
+ @property
+ def info(self):
+ """dict: a dictionary of info about the object"""
+ basic_info = {
+ 'num_gts': self.num_gts,
+ 'num_preds': self.num_preds,
+ 'gt_inds': self.gt_inds,
+ 'max_overlaps': self.max_overlaps,
+ 'labels': self.labels,
+ }
+ basic_info.update(self._extra_properties)
+ return basic_info
+
+ def __nice__(self):
+ """str: a "nice" summary string describing this assign result"""
+ parts = []
+ parts.append(f'num_gts={self.num_gts!r}')
+ if self.gt_inds is None:
+ parts.append(f'gt_inds={self.gt_inds!r}')
+ else:
+ parts.append(f'gt_inds.shape={tuple(self.gt_inds.shape)!r}')
+ if self.max_overlaps is None:
+ parts.append(f'max_overlaps={self.max_overlaps!r}')
+ else:
+ parts.append('max_overlaps.shape='
+ f'{tuple(self.max_overlaps.shape)!r}')
+ if self.labels is None:
+ parts.append(f'labels={self.labels!r}')
+ else:
+ parts.append(f'labels.shape={tuple(self.labels.shape)!r}')
+ return ', '.join(parts)
+
+ @classmethod
+ def random(cls, **kwargs):
+ """Create random AssignResult for tests or debugging.
+
+ Args:
+ num_preds: number of predicted boxes
+ num_gts: number of true boxes
+ p_ignore (float): probability of a predicted box assigned to an
+ ignored truth
+            p_assigned (float): probability of a predicted box being
+                assigned
+ p_use_label (float | bool): with labels or not
+ rng (None | int | numpy.random.RandomState): seed or state
+
+ Returns:
+ :obj:`AssignResult`: Randomly generated assign results.
+
+ Example:
+ >>> from mmdet.core.bbox.assigners.assign_result import * # NOQA
+ >>> self = AssignResult.random()
+ >>> print(self.info)
+ """
+ from mmdet.core.bbox import demodata
+ rng = demodata.ensure_rng(kwargs.get('rng', None))
+
+ num_gts = kwargs.get('num_gts', None)
+ num_preds = kwargs.get('num_preds', None)
+ p_ignore = kwargs.get('p_ignore', 0.3)
+ p_assigned = kwargs.get('p_assigned', 0.7)
+ p_use_label = kwargs.get('p_use_label', 0.5)
+        num_classes = kwargs.get('num_classes', 3)
+
+ if num_gts is None:
+ num_gts = rng.randint(0, 8)
+ if num_preds is None:
+ num_preds = rng.randint(0, 16)
+
+ if num_gts == 0:
+ max_overlaps = torch.zeros(num_preds, dtype=torch.float32)
+ gt_inds = torch.zeros(num_preds, dtype=torch.int64)
+ if p_use_label is True or p_use_label < rng.rand():
+ labels = torch.zeros(num_preds, dtype=torch.int64)
+ else:
+ labels = None
+ else:
+ import numpy as np
+
+ # Create an overlap for each predicted box
+ max_overlaps = torch.from_numpy(rng.rand(num_preds))
+
+ # Construct gt_inds for each predicted box
+ is_assigned = torch.from_numpy(rng.rand(num_preds) < p_assigned)
+ # maximum number of assignments constraints
+ n_assigned = min(num_preds, min(num_gts, is_assigned.sum()))
+
+ assigned_idxs = np.where(is_assigned)[0]
+ rng.shuffle(assigned_idxs)
+ assigned_idxs = assigned_idxs[0:n_assigned]
+ assigned_idxs.sort()
+
+ is_assigned[:] = 0
+ is_assigned[assigned_idxs] = True
+
+ is_ignore = torch.from_numpy(
+ rng.rand(num_preds) < p_ignore) & is_assigned
+
+ gt_inds = torch.zeros(num_preds, dtype=torch.int64)
+
+ true_idxs = np.arange(num_gts)
+ rng.shuffle(true_idxs)
+ true_idxs = torch.from_numpy(true_idxs)
+ gt_inds[is_assigned] = true_idxs[:n_assigned].long()
+
+ gt_inds = torch.from_numpy(
+ rng.randint(1, num_gts + 1, size=num_preds))
+ gt_inds[is_ignore] = -1
+ gt_inds[~is_assigned] = 0
+ max_overlaps[~is_assigned] = 0
+
+ if p_use_label is True or p_use_label < rng.rand():
+ if num_classes == 0:
+ labels = torch.zeros(num_preds, dtype=torch.int64)
+ else:
+ labels = torch.from_numpy(
+ # remind that we set FG labels to [0, num_class-1]
+ # since mmdet v2.0
+ # BG cat_id: num_class
+ rng.randint(0, num_classes, size=num_preds))
+ labels[~is_assigned] = 0
+ else:
+ labels = None
+
+ self = cls(num_gts, gt_inds, max_overlaps, labels)
+ return self
+
+ def add_gt_(self, gt_labels):
+ """Add ground truth as assigned results.
+
+ Args:
+ gt_labels (torch.Tensor): Labels of gt boxes
+ """
+ self_inds = torch.arange(1,
+ len(gt_labels) + 1,
+ dtype=torch.long,
+ device=gt_labels.device)
+ self.gt_inds = torch.cat([self_inds, self.gt_inds])
+
+ self.max_overlaps = torch.cat(
+ [self.max_overlaps.new_ones(len(gt_labels)), self.max_overlaps])
+
+ if self.labels is not None:
+ self.labels = torch.cat([gt_labels, self.labels])
diff --git a/detrsmpl/core/post_processing/bbox/assigners/base_assigner.py b/detrsmpl/core/post_processing/bbox/assigners/base_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..e63b71a206234e861d574d7c569f9fb93fc6883e
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/assigners/base_assigner.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+
+class BaseAssigner(metaclass=ABCMeta):
+ """Base assigner that assigns boxes to ground truth boxes."""
+ @abstractmethod
+ def assign(self, bboxes, gt_bboxes, gt_bboxes_ignore=None, gt_labels=None):
+ """Assign boxes to either a ground truth boxes or a negative boxes."""
diff --git a/detrsmpl/core/post_processing/bbox/assigners/builder.py b/detrsmpl/core/post_processing/bbox/assigners/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..c27ec35f7e314113e33a48a959dac523258125a8
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/assigners/builder.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.utils import Registry, build_from_cfg
+
+BBOX_ASSIGNERS = Registry('bbox_assigner')
+
+
+def build_assigner(cfg, **default_args):
+ """Builder of box assigner."""
+ return build_from_cfg(cfg, BBOX_ASSIGNERS, default_args)
diff --git a/detrsmpl/core/post_processing/bbox/assigners/hungarian_assigner.py b/detrsmpl/core/post_processing/bbox/assigners/hungarian_assigner.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd6fabf88c7f39bb561c7656b2ee2f2e24a32788
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/assigners/hungarian_assigner.py
@@ -0,0 +1,189 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import torch
+
+# from detrsmpl.core.post_processing.bbox.transforms
+# import bbox_cxcywh_to_xyxy
+from ..match_costs import build_match_cost
+from .assign_result import AssignResult
+from .base_assigner import BaseAssigner
+from .builder import BBOX_ASSIGNERS
+
+try:
+ from scipy.optimize import linear_sum_assignment
+except ImportError:
+ linear_sum_assignment = None
+
+
+@BBOX_ASSIGNERS.register_module()
+class HungarianAssigner(BaseAssigner):
+ """Computes one-to-one matching between predictions and ground truth.
+
+ This class computes an assignment between the targets and the predictions
+    based on the costs. Here the cost is a weighted sum of keypoint matching
+    terms (a 2D keypoint cost and, optionally, a 3D keypoint cost). The
+    targets don't include the no_object class, so generally there are more
+ predictions than targets. After the one-to-one matching, the un-matched
+ are treated as backgrounds. Thus each query prediction will be assigned
+ with `0` or a positive integer indicating the ground truth index:
+
+ - 0: negative sample, no assigned gt
+ - positive integer: positive sample, index (1-based) of assigned gt
+
+ Args:
+        kp3d_cost (dict, optional): Config dict of the 3D keypoint cost used
+            for matching. Defaults to a ``Keypoints3DCost`` with weight 1.0.
+        kp2d_cost (dict, optional): Config dict of the 2D keypoint cost used
+            for matching. Defaults to a ``Keypoints2DCost`` with weight 1.0.
+ """
+ def __init__(
+ self,
+ # cls_cost=dict(type='ClassificationCost', weight=1.),
+ kp3d_cost=dict(type='Keypoints3DCost', covention='smpl_54',
+ weight=1.0),
+ kp2d_cost=dict(type='Keypoints2DCost', covention='smpl_54',
+ weight=1.0),
+ ):
+ # self.cls_cost = build_match_cost(cls_cost)
+ self.kp2d_cost = build_match_cost(kp2d_cost)
+ self.kp3d_cost = build_match_cost(kp3d_cost)
+
+ def assign(
+ self,
+ pred_smpl_pose,
+ pred_smpl_shape,
+ pred_kp3d,
+ pred_vert,
+ pred_cam,
+ gt_smpl_pose,
+ gt_smpl_shape,
+ gt_kp2d,
+ gt_kp3d,
+ has_keypoints2d,
+ has_keypoints3d,
+ has_smpl,
+ img_meta,
+ gt_bboxes_ignore=None,
+ eps=1e-7,
+ # pred_smpl_orient,
+ # pred_keypoints2d,
+ # gt_bboxes,
+ # gt_labels,
+ ):
+ """Computes one-to-one matching based on the weighted costs.
+
+        This method assigns each query prediction to a ground truth or to the
+        background. In `assigned_gt_inds`, -1 means don't care,
+        0 means negative sample, and a positive number is the (1-based) index
+        of the assigned gt.
+        The assignment is done in the following steps; the order matters.
+
+ 1. assign every prediction to -1
+ 2. compute the weighted costs
+ 3. do Hungarian matching on CPU based on the costs
+ 4. assign all to 0 (background) first, then for each matched pair
+ between predictions and gts, treat this prediction as foreground
+ and assign the corresponding gt index (plus 1) to it.
+
+ Args:
+            pred_smpl_pose (Tensor): Predicted SMPL pose parameters,
+                shape [num_query, ...].
+            pred_smpl_shape (Tensor): Predicted SMPL shape parameters.
+            pred_kp3d (Tensor): Predicted 3D keypoints.
+            pred_vert (Tensor): Predicted mesh vertices.
+            pred_cam (Tensor): Predicted camera parameters.
+            gt_smpl_pose (Tensor): Ground truth SMPL pose parameters,
+                shape [num_gt, ...].
+            gt_smpl_shape (Tensor): Ground truth SMPL shape parameters.
+            gt_kp2d (Tensor): Ground truth 2D keypoints.
+            gt_kp3d (Tensor): Ground truth 3D keypoints.
+            has_keypoints2d (Tensor): Flags indicating valid 2D keypoints.
+            has_keypoints3d (Tensor): Flags indicating valid 3D keypoints.
+            has_smpl (Tensor): Flags indicating valid SMPL annotations.
+ img_meta (dict): Meta information for current image.
+ gt_bboxes_ignore (Tensor, optional): Ground truth bboxes that are
+ labelled as `ignored`. Default None.
+ eps (int | float, optional): A value added to the denominator for
+ numerical stability. Default 1e-7.
+
+ Returns:
+ :obj:`AssignResult`: The assigned result.
+ """
+ assert gt_bboxes_ignore is None, \
+ 'Only case when gt_bboxes_ignore is None is supported.'
+ num_gts, num_bboxes = gt_smpl_pose.size(0), pred_smpl_pose.size(0)
+
+ # 1. assign -1 by default
+ assigned_gt_inds = pred_smpl_pose.new_full((num_bboxes, ),
+ -1,
+ dtype=torch.long)
+ assigned_labels = pred_smpl_pose.new_full((num_bboxes, ),
+ -1,
+ dtype=torch.long)
+ if num_gts == 0 or num_bboxes == 0:
+ # No ground truth or boxes, return empty assignment
+ if num_gts == 0:
+ # No ground truth, assign all to background
+ assigned_gt_inds[:] = 0
+ return AssignResult(num_gts,
+ assigned_gt_inds,
+ None,
+ labels=assigned_labels)
+ # img_h, img_w, _ = img_meta['img_shape']
+ # factor = gt_bboxes.new_tensor([img_w, img_h, img_w,
+ # img_h]).unsqueeze(0)
+
+ # 2. compute the weighted costs
+ # classification and bboxcost.
+ # cls_cost = self.cls_cost(cls_pred, gt_labels)
+ # regression L1 cost
+ # normalize_gt_bboxes = gt_bboxes / factor
+
+ # kp3d_cost
+ kp3d_cost = self.kp3d_cost(pred_kp3d, gt_kp3d)
+
+ # kp2d_cost
+ kp2d_cost = self.kp2d_cost(pred_kp3d, pred_cam, gt_kp2d)
+ # smpl_pose_cost
+
+ # smpl_betas_cost
+
+ # verts_cost
+
+ # TODO: bbox_cost
+
+ # TODO: occlusion == bbox intersection
+
+ # regression iou cost; GIoU is used by default in the official DETR.
+ # bboxes = bbox_cxcywh_to_xyxy(bbox_pred) * factor
+ # iou_cost = self.iou_cost(pred_smpl_pose, gt_smpl_pose)
+ # weighted sum of above three costs
+ cost = kp2d_cost # + kp3d_cost
+
+ # 3. do Hungarian matching on CPU using linear_sum_assignment
+ cost = cost.detach().cpu()
+ if linear_sum_assignment is None:
+ raise ImportError('Please run "pip install scipy" '
+ 'to install scipy first.')
+ matched_row_inds, matched_col_inds = linear_sum_assignment(cost)
+ matched_row_inds = torch.from_numpy(matched_row_inds).to(
+ pred_smpl_pose.device)
+ matched_col_inds = torch.from_numpy(matched_col_inds).to(
+ pred_smpl_pose.device)
+
+ # 4. assign backgrounds and foregrounds
+ # assign all indices to backgrounds first
+ assigned_gt_inds[:] = 0
+ # assign foregrounds based on matching results
+ assigned_gt_inds[matched_row_inds] = matched_col_inds + 1
+ # assigned_labels[matched_row_inds] = None # gt_smpl_pose[matched_col_inds]
+ assigned_labels = None
+ return AssignResult(num_gts,
+ assigned_gt_inds,
+ None,
+ labels=assigned_labels)
+
+ # Note: `num_gts` is the number of ground truth instances and
+ # `assigned_gt_inds` is exposed as `AssignResult.gt_inds`.
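+
+ # Illustrative usage sketch (shapes and values below are assumptions for
+ # demonstration only). Given an instance `matcher` of the assigner class
+ # defined above:
+ #
+ # import torch
+ # num_query, num_gt, num_kp = 100, 2, 54
+ # result = matcher.assign(
+ #     pred_smpl_pose=torch.rand(num_query, 24, 3, 3),
+ #     pred_smpl_shape=torch.rand(num_query, 10),
+ #     pred_kp3d=torch.rand(num_query, num_kp, 3),
+ #     pred_vert=None,
+ #     pred_cam=torch.rand(num_query, 3),
+ #     gt_smpl_pose=torch.rand(num_gt, 24, 3, 3),
+ #     gt_smpl_shape=torch.rand(num_gt, 10),
+ #     gt_kp2d=torch.rand(num_gt, num_kp, 3),
+ #     gt_kp3d=torch.rand(num_gt, num_kp, 4),
+ #     has_keypoints2d=None, has_keypoints3d=None, has_smpl=None,
+ #     img_meta=dict())
+ # # result.gt_inds[i] is 0 (background) or the 1-based index of the
+ # # ground truth matched to query i.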
diff --git a/detrsmpl/core/post_processing/bbox/coder/__init__.py b/detrsmpl/core/post_processing/bbox/coder/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e8217120e4bac7e1044f85242eda60ce09a9d58a
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/coder/__init__.py
@@ -0,0 +1,5 @@
+from .base_bbox_coder import BaseBBoxCoder
+from .builder import build_bbox_coder
+from .distance_point_bbox_coder import DistancePointBBoxCoder
+
+__all__ = ['build_bbox_coder', 'BaseBBoxCoder', 'DistancePointBBoxCoder']
diff --git a/detrsmpl/core/post_processing/bbox/coder/base_bbox_coder.py b/detrsmpl/core/post_processing/bbox/coder/base_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd1f4f97ea96763f85daf0d7010a911a2e9cf88a
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/coder/base_bbox_coder.py
@@ -0,0 +1,17 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+
+class BaseBBoxCoder(metaclass=ABCMeta):
+ """Base bounding box coder."""
+ def __init__(self, **kwargs):
+ pass
+
+ @abstractmethod
+ def encode(self, bboxes, gt_bboxes):
+ """Encode deltas between bboxes and ground truth boxes."""
+
+ @abstractmethod
+ def decode(self, bboxes, bboxes_pred):
+ """Decode the predicted bboxes according to prediction and base
+ boxes."""
diff --git a/detrsmpl/core/post_processing/bbox/coder/builder.py b/detrsmpl/core/post_processing/bbox/coder/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..154eba608de21b22e1273f479c6644c4e6220e85
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/coder/builder.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.utils import Registry, build_from_cfg
+
+BBOX_CODERS = Registry('bbox_coder')
+
+
+def build_bbox_coder(cfg, **default_args):
+ """Builder of box coder."""
+ return build_from_cfg(cfg, BBOX_CODERS, default_args)
diff --git a/detrsmpl/core/post_processing/bbox/coder/distance_point_bbox_coder.py b/detrsmpl/core/post_processing/bbox/coder/distance_point_bbox_coder.py
new file mode 100644
index 0000000000000000000000000000000000000000..c04acbd87ff5832b78f98e90f16920da1ec3428e
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/coder/distance_point_bbox_coder.py
@@ -0,0 +1,63 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..transforms import bbox2distance, distance2bbox
+from .base_bbox_coder import BaseBBoxCoder
+from .builder import BBOX_CODERS
+
+
+@BBOX_CODERS.register_module()
+class DistancePointBBoxCoder(BaseBBoxCoder):
+ """Distance Point BBox coder.
+
+ This coder encodes gt bboxes (x1, y1, x2, y2) into (top, bottom, left,
+ right) and decode it back to the original.
+
+ Args:
+ clip_border (bool, optional): Whether clip the objects outside the
+ border of the image. Defaults to True.
+ """
+
+ def __init__(self, clip_border=True):
+ super(BaseBBoxCoder, self).__init__()
+ self.clip_border = clip_border
+
+ def encode(self, points, gt_bboxes, max_dis=None, eps=0.1):
+ """Encode bounding box to distances.
+
+ Args:
+ points (Tensor): Shape (N, 2), The format is [x, y].
+ gt_bboxes (Tensor): Shape (N, 4), The format is "xyxy"
+ max_dis (float): Upper bound of the distance. Default None.
+ eps (float): a small value to ensure target < max_dis, instead <=.
+ Default 0.1.
+
+ Returns:
+ Tensor: Box transformation deltas. The shape is (N, 4).
+ """
+ assert points.size(0) == gt_bboxes.size(0)
+ assert points.size(-1) == 2
+ assert gt_bboxes.size(-1) == 4
+ return bbox2distance(points, gt_bboxes, max_dis, eps)
+
+ def decode(self, points, pred_bboxes, max_shape=None):
+ """Decode distance prediction to bounding box.
+
+ Args:
+ points (Tensor): Shape (B, N, 2) or (N, 2).
+ pred_bboxes (Tensor): Distance from the given point to 4
+ boundaries (left, top, right, bottom). Shape (B, N, 4)
+ or (N, 4)
+ max_shape (Sequence[int] or torch.Tensor or Sequence[
+ Sequence[int]],optional): Maximum bounds for boxes, specifies
+ (H, W, C) or (H, W). If priors shape is (B, N, 4), then
+ the max_shape should be a Sequence[Sequence[int]],
+ and the length of max_shape should also be B.
+ Default None.
+ Returns:
+ Tensor: Boxes with shape (N, 4) or (B, N, 4)
+ """
+ assert points.size(0) == pred_bboxes.size(0)
+ assert points.size(-1) == 2
+ assert pred_bboxes.size(-1) == 4
+ if self.clip_border is False:
+ max_shape = None
+ return distance2bbox(points, pred_bboxes, max_shape)
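+
+ # Minimal usage sketch (illustrative values only):
+ #
+ # import torch
+ # coder = DistancePointBBoxCoder()
+ # points = torch.tensor([[5., 5.], [10., 10.]])
+ # gt = torch.tensor([[0., 0., 8., 8.], [4., 4., 20., 20.]])
+ # dist = coder.encode(points, gt)     # (left, top, right, bottom)
+ # boxes = coder.decode(points, dist)  # recovers gt when no clipping applies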
diff --git a/detrsmpl/core/post_processing/bbox/match_costs/__init__.py b/detrsmpl/core/post_processing/bbox/match_costs/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..abebdda7277e180ae0a00d78eee3e821061c81e2
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/match_costs/__init__.py
@@ -0,0 +1,15 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .builder import build_match_cost
+from .match_cost import (
+ BBoxL1Cost,
+ ClassificationCost,
+ CrossEntropyLossCost,
+ DiceCost,
+ FocalLossCost,
+ IoUCost,
+)
+
+__all__ = [
+ 'build_match_cost', 'ClassificationCost', 'BBoxL1Cost', 'IoUCost',
+ 'FocalLossCost', 'DiceCost', 'CrossEntropyLossCost'
+]
diff --git a/detrsmpl/core/post_processing/bbox/match_costs/builder.py b/detrsmpl/core/post_processing/bbox/match_costs/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea086adff23c5adbc35d448d5a93daf1a04bdc53
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/match_costs/builder.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.utils import Registry, build_from_cfg
+
+MATCH_COST = Registry('Match Cost')
+
+
+def build_match_cost(cfg, default_args=None):
+ """Builder of matching cost."""
+ return build_from_cfg(cfg, MATCH_COST, default_args)
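+
+ # Usage sketch (illustrative): matching costs are built from config dicts,
+ # e.g.
+ #
+ # cost = build_match_cost(dict(type='ClassificationCost', weight=1.0))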
diff --git a/detrsmpl/core/post_processing/bbox/match_costs/match_cost.py b/detrsmpl/core/post_processing/bbox/match_costs/match_cost.py
new file mode 100644
index 0000000000000000000000000000000000000000..883803c20abcc574773fe0c5a2473ef5d1d0b919
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/match_costs/match_cost.py
@@ -0,0 +1,551 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn.functional as F
+from mmdet.core.bbox.iou_calculators import bbox_overlaps
+from mmdet.core.bbox.transforms import bbox_cxcywh_to_xyxy, bbox_xyxy_to_cxcywh
+from typing import Optional, Tuple, Union
+from detrsmpl.core.conventions.keypoints_mapping import get_keypoint_idx
+from detrsmpl.utils.geometry import project_points
+
+from .builder import MATCH_COST
+
+
+@MATCH_COST.register_module()
+class BBoxL1Cost:
+ """BBoxL1Cost.
+
+ Args:
+ weight (int | float, optional): loss_weight
+ box_format (str, optional): 'xyxy' for DETR, 'xywh' for Sparse_RCNN
+
+ Examples:
+ >>> from mmdet.core.bbox.match_costs.match_cost import BBoxL1Cost
+ >>> import torch
+ >>> self = BBoxL1Cost()
+ >>> bbox_pred = torch.rand(1, 4)
+ >>> gt_bboxes= torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]])
+ >>> factor = torch.tensor([10, 8, 10, 8])
+ >>> self(bbox_pred, gt_bboxes, factor)
+ tensor([[1.6172, 1.6422]])
+ """
+ def __init__(self, weight=1., box_format='xyxy'):
+ self.weight = weight
+ assert box_format in ['xyxy', 'xywh']
+ self.box_format = box_format
+
+ def __call__(self, bbox_pred, gt_bboxes):
+ """
+ Args:
+ bbox_pred (Tensor): Predicted boxes with normalized coordinates
+ (cx, cy, w, h), which are all in range [0, 1]. Shape
+ (num_query, 4).
+ gt_bboxes (Tensor): Ground truth boxes with normalized
+ coordinates (x1, y1, x2, y2). Shape (num_gt, 4).
+
+ Returns:
+ torch.Tensor: bbox_cost value with weight
+ """
+ if self.box_format == 'xywh':
+ gt_bboxes = bbox_xyxy_to_cxcywh(gt_bboxes)
+ elif self.box_format == 'xyxy':
+ bbox_pred = bbox_cxcywh_to_xyxy(bbox_pred)
+ bbox_cost = torch.cdist(bbox_pred, gt_bboxes, p=1)
+ return bbox_cost * self.weight
+
+
+@MATCH_COST.register_module()
+class FocalLossCost:
+ """FocalLossCost.
+
+ Args:
+ weight (int | float, optional): loss_weight
+ alpha (int | float, optional): focal_loss alpha
+ gamma (int | float, optional): focal_loss gamma
+ eps (float, optional): default 1e-12
+ binary_input (bool, optional): Whether the input is binary,
+ default False.
+
+ Examples:
+ >>> from mmdet.core.bbox.match_costs.match_cost import FocalLossCost
+ >>> import torch
+ >>> self = FocalLossCost()
+ >>> cls_pred = torch.rand(4, 3)
+ >>> gt_labels = torch.tensor([0, 1, 2])
+ >>> factor = torch.tensor([10, 8, 10, 8])
+ >>> self(cls_pred, gt_labels)
+ tensor([[-0.3236, -0.3364, -0.2699],
+ [-0.3439, -0.3209, -0.4807],
+ [-0.4099, -0.3795, -0.2929],
+ [-0.1950, -0.1207, -0.2626]])
+ """
+ def __init__(self,
+ weight=1.,
+ alpha=0.25,
+ gamma=2,
+ eps=1e-12,
+ binary_input=False):
+ self.weight = weight
+ self.alpha = alpha
+ self.gamma = gamma
+ self.eps = eps
+ self.binary_input = binary_input
+
+ def _focal_loss_cost(self, cls_pred, gt_labels):
+ """
+ Args:
+ cls_pred (Tensor): Predicted classification logits, shape
+ (num_query, num_class).
+ gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+
+ Returns:
+ torch.Tensor: cls_cost value with weight
+ """
+ cls_pred = cls_pred.sigmoid()
+ neg_cost = -(1 - cls_pred + self.eps).log() * (
+ 1 - self.alpha) * cls_pred.pow(self.gamma)
+ pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
+ 1 - cls_pred).pow(self.gamma)
+
+ cls_cost = pos_cost[:, gt_labels] - neg_cost[:, gt_labels]
+ return cls_cost * self.weight
+
+ def _mask_focal_loss_cost(self, cls_pred, gt_labels):
+ """
+ Args:
+ cls_pred (Tensor): Predicted classification logits
+ in shape (num_query, d1, ..., dn), dtype=torch.float32.
+ gt_labels (Tensor): Ground truth in shape (num_gt, d1, ..., dn),
+ dtype=torch.long. Labels should be binary.
+
+ Returns:
+ Tensor: Focal cost matrix with weight in shape\
+ (num_query, num_gt).
+ """
+ cls_pred = cls_pred.flatten(1)
+ gt_labels = gt_labels.flatten(1).float()
+ n = cls_pred.shape[1]
+ cls_pred = cls_pred.sigmoid()
+ neg_cost = -(1 - cls_pred + self.eps).log() * (
+ 1 - self.alpha) * cls_pred.pow(self.gamma)
+ pos_cost = -(cls_pred + self.eps).log() * self.alpha * (
+ 1 - cls_pred).pow(self.gamma)
+
+ cls_cost = torch.einsum('nc,mc->nm', pos_cost, gt_labels) + \
+ torch.einsum('nc,mc->nm', neg_cost, (1 - gt_labels))
+ return cls_cost / n * self.weight
+
+ def __call__(self, cls_pred, gt_labels):
+ """
+ Args:
+ cls_pred (Tensor): Predicted classification logits.
+ gt_labels (Tensor)): Labels.
+
+ Returns:
+ Tensor: Focal cost matrix with weight in shape\
+ (num_query, num_gt).
+ """
+ if self.binary_input:
+ return self._mask_focal_loss_cost(cls_pred, gt_labels)
+ else:
+ return self._focal_loss_cost(cls_pred, gt_labels)
+
+
+@MATCH_COST.register_module()
+class ClassificationCost:
+ """ClsSoftmaxCost.
+
+ Args:
+ weight (int | float, optional): loss_weight
+
+ Examples:
+ >>> from mmdet.core.bbox.match_costs.match_cost import \
+ ... ClassificationCost
+ >>> import torch
+ >>> self = ClassificationCost()
+ >>> cls_pred = torch.rand(4, 3)
+ >>> gt_labels = torch.tensor([0, 1, 2])
+ >>> factor = torch.tensor([10, 8, 10, 8])
+ >>> self(cls_pred, gt_labels)
+ tensor([[-0.3430, -0.3525, -0.3045],
+ [-0.3077, -0.2931, -0.3992],
+ [-0.3664, -0.3455, -0.2881],
+ [-0.3343, -0.2701, -0.3956]])
+ """
+ def __init__(self, weight=1.):
+ self.weight = weight
+
+ def __call__(self, cls_pred, gt_labels):
+ """
+ Args:
+ cls_pred (Tensor): Predicted classification logits, shape
+ (num_query, num_class).
+ gt_labels (Tensor): Label of `gt_bboxes`, shape (num_gt,).
+
+ Returns:
+ torch.Tensor: cls_cost value with weight
+ """
+ # Following the official DETR repo: instead of the NLL used in the
+ # loss, the cost is approximated by 1 - cls_score[gt_label].
+ # The constant 1 doesn't change the matching, so it is omitted.
+ cls_score = cls_pred.softmax(-1)
+ cls_cost = -cls_score[:, gt_labels]
+ return cls_cost * self.weight
+
+
+@MATCH_COST.register_module()
+class IoUCost:
+ """IoUCost.
+
+ Args:
+ iou_mode (str, optional): iou mode such as 'iou' | 'giou'
+ weight (int | float, optional): loss weight
+
+ Examples:
+ >>> from mmdet.core.bbox.match_costs.match_cost import IoUCost
+ >>> import torch
+ >>> self = IoUCost()
+ >>> bboxes = torch.FloatTensor([[1,1, 2, 2], [2, 2, 3, 4]])
+ >>> gt_bboxes = torch.FloatTensor([[0, 0, 2, 4], [1, 2, 3, 4]])
+ >>> self(bboxes, gt_bboxes)
+ tensor([[-0.1250, 0.1667],
+ [ 0.1667, -0.5000]])
+ """
+ def __init__(self, iou_mode='giou', weight=1.):
+ self.weight = weight
+ self.iou_mode = iou_mode
+
+ def __call__(self, bboxes, gt_bboxes):
+ """
+ Args:
+ bboxes (Tensor): Predicted boxes with unnormalized coordinates
+ (x1, y1, x2, y2). Shape (num_query, 4).
+ gt_bboxes (Tensor): Ground truth boxes with unnormalized
+ coordinates (x1, y1, x2, y2). Shape (num_gt, 4).
+
+ Returns:
+ torch.Tensor: iou_cost value with weight
+ """
+ # overlaps: [num_bboxes, num_gt]
+ overlaps = bbox_overlaps(bboxes,
+ gt_bboxes,
+ mode=self.iou_mode,
+ is_aligned=False)
+ # The 1 is a constant that doesn't change the matching, so omitted.
+ iou_cost = -overlaps
+ return iou_cost * self.weight
+
+
+@MATCH_COST.register_module()
+class DiceCost:
+ """Cost of mask assignments based on dice losses.
+
+ Args:
+ weight (int | float, optional): loss_weight. Defaults to 1.
+ pred_act (bool, optional): Whether to apply sigmoid to mask_pred.
+ Defaults to False.
+ eps (float, optional): Default 1e-3.
+ naive_dice (bool, optional): If True, use the naive dice loss
+ in which the power of the number in the denominator is
+ the first power. If False, use the second power that
+ is adopted by K-Net and SOLO.
+ Defaults to True.
+ """
+ def __init__(self, weight=1., pred_act=False, eps=1e-3, naive_dice=True):
+ self.weight = weight
+ self.pred_act = pred_act
+ self.eps = eps
+ self.naive_dice = naive_dice
+
+ def binary_mask_dice_loss(self, mask_preds, gt_masks):
+ """
+ Args:
+ mask_preds (Tensor): Mask prediction in shape (num_query, *).
+ gt_masks (Tensor): Ground truth in shape (num_gt, *)
+ store 0 or 1, 0 for negative class and 1 for
+ positive class.
+
+ Returns:
+ Tensor: Dice cost matrix in shape (num_query, num_gt).
+ """
+ mask_preds = mask_preds.flatten(1)
+ gt_masks = gt_masks.flatten(1).float()
+ numerator = 2 * torch.einsum('nc,mc->nm', mask_preds, gt_masks)
+ if self.naive_dice:
+ denominator = mask_preds.sum(-1)[:, None] + \
+ gt_masks.sum(-1)[None, :]
+ else:
+ denominator = mask_preds.pow(2).sum(1)[:, None] + \
+ gt_masks.pow(2).sum(1)[None, :]
+ loss = 1 - (numerator + self.eps) / (denominator + self.eps)
+ return loss
+
+ def __call__(self, mask_preds, gt_masks):
+ """
+ Args:
+ mask_preds (Tensor): Mask prediction logits in shape (num_query, *)
+ gt_masks (Tensor): Ground truth in shape (num_gt, *)
+
+ Returns:
+ Tensor: Dice cost matrix with weight in shape (num_query, num_gt).
+ """
+ if self.pred_act:
+ mask_preds = mask_preds.sigmoid()
+ dice_cost = self.binary_mask_dice_loss(mask_preds, gt_masks)
+ return dice_cost * self.weight
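+
+ # Illustrative sketch: pairwise dice cost between 3 predicted masks and
+ # 2 ground truth masks (random values, shown only for the output shape).
+ #
+ # import torch
+ # dice = DiceCost(pred_act=True)
+ # mask_preds = torch.rand(3, 16, 16)              # logits
+ # gt_masks = (torch.rand(2, 16, 16) > 0.5).long()
+ # cost = dice(mask_preds, gt_masks)               # shape (3, 2)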
+
+
+@MATCH_COST.register_module()
+class CrossEntropyLossCost:
+ """CrossEntropyLossCost.
+
+ Args:
+ weight (int | float, optional): loss weight. Defaults to 1.
+ use_sigmoid (bool, optional): Whether the prediction uses sigmoid
+ or softmax. Defaults to True.
+ Examples:
+ >>> from mmdet.core.bbox.match_costs import CrossEntropyLossCost
+ >>> import torch
+ >>> bce = CrossEntropyLossCost(use_sigmoid=True)
+ >>> cls_pred = torch.tensor([[7.6, 1.2], [-1.3, 10]])
+ >>> gt_labels = torch.tensor([[1, 1], [1, 0]])
+ >>> print(bce(cls_pred, gt_labels))
+ """
+ def __init__(self, weight=1., use_sigmoid=True):
+ assert use_sigmoid, 'use_sigmoid = False is not supported yet.'
+ self.weight = weight
+ self.use_sigmoid = use_sigmoid
+
+ def _binary_cross_entropy(self, cls_pred, gt_labels):
+ """
+ Args:
+ cls_pred (Tensor): The prediction with shape (num_query, 1, *) or
+ (num_query, *).
+ gt_labels (Tensor): The learning label of prediction with
+ shape (num_gt, *).
+
+ Returns:
+ Tensor: Cross entropy cost matrix in shape (num_query, num_gt).
+ """
+ cls_pred = cls_pred.flatten(1).float()
+ gt_labels = gt_labels.flatten(1).float()
+ n = cls_pred.shape[1]
+ pos = F.binary_cross_entropy_with_logits(cls_pred,
+ torch.ones_like(cls_pred),
+ reduction='none')
+ neg = F.binary_cross_entropy_with_logits(cls_pred,
+ torch.zeros_like(cls_pred),
+ reduction='none')
+ cls_cost = torch.einsum('nc,mc->nm', pos, gt_labels) + \
+ torch.einsum('nc,mc->nm', neg, 1 - gt_labels)
+ cls_cost = cls_cost / n
+
+ return cls_cost
+
+ def __call__(self, cls_pred, gt_labels):
+ """
+ Args:
+ cls_pred (Tensor): Predicted classification logits.
+ gt_labels (Tensor): Labels.
+
+ Returns:
+ Tensor: Cross entropy cost matrix with weight in
+ shape (num_query, num_gt).
+ """
+ if self.use_sigmoid:
+ cls_cost = self._binary_cross_entropy(cls_pred, gt_labels)
+ else:
+ raise NotImplementedError
+
+ return cls_cost * self.weight
+
+
+@MATCH_COST.register_module()
+class Keypoints3DCost(object):
+ """L1 matching cost between predicted and ground truth 3D keypoints.
+
+ Keypoints are pelvis-aligned before the cost is computed, and the cost
+ is normalized by the number of valid (confidence > 0) joints.
+
+ Args:
+ convention (str): Keypoint convention used to look up the hip joints,
+ e.g. 'smpl_54'.
+ weight (float, optional): Scale factor of the cost. Default 1.0.
+ """
+ def __init__(
+ self,
+ convention,
+ weight=1.0,
+ ) -> None:
+ self.weight = weight
+ self.convention = convention
+
+ def __call__(self,
+ pred_keypoints3d: torch.Tensor,
+ gt_keypoints3d: torch.Tensor,
+ has_keypoints3d: Optional[torch.Tensor] = None):
+ """Compute the 3D keypoint cost matrix.
+
+ Args:
+ pred_keypoints3d (torch.Tensor): Predicted 3D keypoints with shape
+ [num_query, kp_num, 3].
+ gt_keypoints3d (torch.Tensor): Ground truth 3D keypoints with
+ confidence, shape [num_gt, kp_num, 4].
+ has_keypoints3d (torch.Tensor, optional): Per-instance flags for
+ valid 3D keypoints. Defaults to None.
+
+ Returns:
+ torch.Tensor: Cost matrix with shape [num_query, num_gt].
+ """
+ # B: batch_size N: instance_num K: kp_num D: 2 for 2D; 3 for 3D
+ Q = pred_keypoints3d.shape[0] # Q means query num
+ N, K, D = gt_keypoints3d.shape
+
+ gt_keypoints3d = gt_keypoints3d.unsqueeze(1).repeat([1, Q, 1, 1])
+ keypoints3d_conf = gt_keypoints3d[..., 3].float().unsqueeze(-1)
+ keypoints3d_conf = keypoints3d_conf.repeat(1, 1, 1, 3)
+ gt_keypoints3d = gt_keypoints3d[..., :3].float()
+ pred_keypoints3d = pred_keypoints3d.unsqueeze(0).repeat([N, 1, 1,
+ 1]).float()
+
+ right_hip_idx = get_keypoint_idx('right_hip_extra', self.convention)
+ left_hip_idx = get_keypoint_idx('left_hip_extra', self.convention)
+
+ gt_pelvis = (gt_keypoints3d[:, :, right_hip_idx, :] +
+ gt_keypoints3d[:, :, left_hip_idx, :]) / 2
+ pred_pelvis = (pred_keypoints3d[:, :, right_hip_idx, :] +
+ pred_keypoints3d[:, :, left_hip_idx, :]) / 2
+
+ gt_keypoints3d = gt_keypoints3d - gt_pelvis[:, :, None, :]
+ pred_keypoints3d = pred_keypoints3d - pred_pelvis[:, :, None, :]
+
+ # [Q, N]
+ loss = torch.abs(gt_keypoints3d - pred_keypoints3d).sum([-2,
+ -1]).permute(
+ 1, 0)
+ # shape: N
+ avg_factor = (keypoints3d_conf[:, 0, :, 0] > 0).sum(-1)
+
+ loss = self.weight * (loss / avg_factor)
+ return loss
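+
+ # Illustrative sketch (assumes the 'smpl_54' convention provides the
+ # 'left_hip_extra'/'right_hip_extra' joints; shapes are for demonstration):
+ #
+ # import torch
+ # cost_fn = Keypoints3DCost(convention='smpl_54', weight=1.0)
+ # pred_kp3d = torch.rand(100, 54, 3)      # [num_query, K, 3]
+ # gt_kp3d = torch.rand(2, 54, 4)          # [num_gt, K, 3 + conf]
+ # cost = cost_fn(pred_kp3d, gt_kp3d)      # [num_query, num_gt]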
+
+
+@MATCH_COST.register_module()
+class Keypoints2DCost(object):
+ """Reprojected 2D keypoint L1 matching cost.
+
+ Predicted 3D keypoints are projected to the image plane with the
+ predicted weak-perspective camera, scaled by the image resolution, and
+ compared against the ground truth 2D keypoints with an L1 cost. The cost
+ is normalized by the number of valid (confidence > 0) joints.
+
+ Args:
+ convention (str): Keypoint convention, e.g. 'smpl_54'.
+ weight (float, optional): Scale factor of the cost. Default 1.0.
+ img_res (int, optional): Image resolution used for projection.
+ Default 512.
+ focal_length (float, optional): Focal length of the assumed camera.
+ Default 5000.
+ """
+ def __init__(
+ self,
+ convention,
+ weight=1.0,
+ img_res=512,
+ focal_length=5000.,
+ ) -> None:
+ self.weight = weight
+ self.convention = convention
+ self.img_res = img_res
+ self.focal_length = focal_length
+
+ def __call__(self,
+ pred_keypoints3d: torch.Tensor,
+ pred_camera: torch.Tensor,
+ gt_keypoints2d: torch.Tensor,
+ has_keypoints2d: Optional[torch.Tensor] = None):
+ """Compute the projected 2D keypoint cost matrix.
+
+ Args:
+ pred_keypoints3d (torch.Tensor): Predicted 3D keypoints with shape
+ [num_query, kp_num, 3].
+ pred_camera (torch.Tensor): Predicted weak-perspective camera
+ parameters (s, tx, ty) with shape [num_query, 3].
+ gt_keypoints2d (torch.Tensor): Ground truth 2D keypoints with
+ confidence, shape [num_gt, kp_num, 3].
+ has_keypoints2d (torch.Tensor, optional): Per-instance flags for
+ valid 2D keypoints. Defaults to None.
+
+ Returns:
+ torch.Tensor: Cost matrix with shape [num_query, num_gt].
+ """
+ # B: batch_size N: instance_num K: kp_num D: 2 for 2D; 3 for 3D
+ Q = pred_keypoints3d.shape[0] # Q means query num
+ N, K, D = gt_keypoints2d.shape
+
+ gt_keypoints2d = gt_keypoints2d.unsqueeze(1).repeat([1, Q, 1, 1])
+ keypoints2d_conf = gt_keypoints2d[..., 2].float().unsqueeze(-1)
+ keypoints2d_conf = keypoints2d_conf.repeat(1, 1, 1, 2)
+ gt_keypoints2d = gt_keypoints2d[..., :2].float()
+ pred_keypoints3d = pred_keypoints3d.unsqueeze(0).repeat([N, 1, 1,
+ 1]).float()
+ pred_camera = pred_camera.unsqueeze(0).repeat([N, 1, 1]).float()
+
+ cam_t = torch.stack([
+ pred_camera[..., 1], pred_camera[..., 2], 2 * self.focal_length /
+ (self.img_res * pred_camera[..., 0] + 1e-9)
+ ],
+ dim=-1)
+
+ K = torch.zeros([N, Q, 3, 3], device=pred_keypoints3d.device)
+ K[..., 0, 0] = self.focal_length
+ K[..., 1, 1] = self.focal_length
+ K[..., 2, 2] = 1.
+ K[..., :-1, -1] = torch.tensor([self.img_res / 2., self.img_res / 2.],
+ device=pred_keypoints3d.device)
+
+ # transform
+ pred_keypoints3d_ = pred_keypoints3d + cam_t.unsqueeze(2)
+ projected_kp3d = pred_keypoints3d_ / pred_keypoints3d_[
+ ..., -1].unsqueeze(-1)
+
+ # apply camera intrinsics
+ projected_kp3d = torch.einsum('nqij,nqkj->nqki', K, projected_kp3d)
+ pred_keypoints2d = projected_kp3d[..., :-1]
+
+ # Normalize keypoints to [-1, 1]
+ pred_keypoints2d = 2 * pred_keypoints2d / (self.img_res - 1)
+ gt_keypoints2d = 2 * gt_keypoints2d / (self.img_res - 1)
+
+ # compute the loss
+ # [Q, N]
+ loss = torch.abs(gt_keypoints2d - pred_keypoints2d).sum([-2,
+ -1]).permute(
+ 1, 0)
+ # shape: N
+ avg_factor = (keypoints2d_conf[:, 0, :, 0] > 0).sum(-1)
+
+ loss = self.weight * (loss / avg_factor)
+ return loss
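+
+ # Illustrative sketch; the weak-perspective camera is (s, tx, ty) and the
+ # shapes/values below are assumptions for demonstration only.
+ #
+ # import torch
+ # cost_fn = Keypoints2DCost(convention='smpl_54', img_res=512)
+ # pred_kp3d = torch.rand(100, 54, 3)            # [num_query, K, 3]
+ # pred_cam = torch.rand(100, 3) + 0.5           # keep the scale s > 0
+ # gt_kp2d = torch.rand(2, 54, 3)                # [num_gt, K, 2 + conf]
+ # cost = cost_fn(pred_kp3d, pred_cam, gt_kp2d)  # [num_query, num_gt]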
+
+
+@MATCH_COST.register_module()
+class KeypointsMSECost(object):
+ """Weighted MSE matching cost between predicted and target keypoints.
+
+ Args:
+ weight (float, optional): Scale factor of the cost. Default 1.0.
+ keypoint_weight (Tensor, optional): Per-keypoint weights used as a
+ fallback when none are passed to `__call__`. Default None.
+ """
+ def __init__(self, weight=1.0, keypoint_weight=None) -> None:
+ self.weight = weight
+ # optional per-keypoint weighting used as a fallback in __call__
+ self.keypoint_weight = keypoint_weight
+
+ def __call__(self,
+ pred,
+ target,
+ pred_conf=None,
+ target_conf=None,
+ keypoint_weight=None):
+
+ # N: instance (query) num, B: target num, K: kp_num, D: 2 for 2D; 3 for 3D
+ N = pred.shape[0]
+ B, K, D = target.shape
+
+ # confidences are assumed to be given per predicted / per target joint
+ pred_conf = pred_conf.view((1, N, K, 1)) \
+ if pred_conf is not None else 1.0
+ target_conf = target_conf.view((B, 1, K, 1)) \
+ if target_conf is not None else 1.0
+ keypoint_weight = keypoint_weight.view((1, 1, K, 1)) \
+ if keypoint_weight is not None else \
+ self.keypoint_weight.view((1, 1, K, 1)).type_as(pred) \
+ if self.keypoint_weight is not None else 1.0
+
+ weight = keypoint_weight * pred_conf * target_conf
+
+ pred = pred.unsqueeze(0).repeat([B, 1, 1, 1]) # [B, N, K, D]
+ target = target.unsqueeze(1).repeat([1, N, 1, 1]) # [B, N, K, D]
+
+ # weighted squared error, reduced over keypoints and dims -> [N, B]
+ loss = self.weight * (weight * F.mse_loss(
+ pred, target, reduction='none')).sum([-2, -1]).permute(1, 0)
+
+ return loss
diff --git a/detrsmpl/core/post_processing/bbox/samplers/__init__.py b/detrsmpl/core/post_processing/bbox/samplers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..8361def97989d1d08d978879a63ecdf7e5458c5a
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/samplers/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from .base_sampler import BaseSampler
+from .builder import build_sampler
+from .pseudo_sampler import PseudoSampler
+
+__all__ = ['build_sampler', 'BaseSampler', 'PseudoSampler']
diff --git a/detrsmpl/core/post_processing/bbox/samplers/base_sampler.py b/detrsmpl/core/post_processing/bbox/samplers/base_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..dee649739e03013050089d831a28e4c549b06768
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/samplers/base_sampler.py
@@ -0,0 +1,105 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+import torch
+
+from .sampling_result import SamplingResult
+
+
+class BaseSampler(metaclass=ABCMeta):
+ """Base class of samplers."""
+ def __init__(self,
+ num,
+ pos_fraction,
+ neg_pos_ub=-1,
+ add_gt_as_proposals=True,
+ **kwargs):
+ self.num = num
+ self.pos_fraction = pos_fraction
+ self.neg_pos_ub = neg_pos_ub
+ self.add_gt_as_proposals = add_gt_as_proposals
+ self.pos_sampler = self
+ self.neg_sampler = self
+
+ @abstractmethod
+ def _sample_pos(self, assign_result, num_expected, **kwargs):
+ """Sample positive samples."""
+ pass
+
+ @abstractmethod
+ def _sample_neg(self, assign_result, num_expected, **kwargs):
+ """Sample negative samples."""
+ pass
+
+ def sample(self,
+ assign_result,
+ bboxes,
+ gt_bboxes,
+ gt_labels=None,
+ **kwargs):
+ """Sample positive and negative bboxes.
+
+ This is a simple implementation of bbox sampling given candidates,
+ assigning results and ground truth bboxes.
+
+ Args:
+ assign_result (:obj:`AssignResult`): Bbox assigning results.
+ bboxes (Tensor): Boxes to be sampled from.
+ gt_bboxes (Tensor): Ground truth bboxes.
+ gt_labels (Tensor, optional): Class labels of ground truth bboxes.
+
+ Returns:
+ :obj:`SamplingResult`: Sampling result.
+
+ Example:
+ >>> from mmdet.core.bbox import RandomSampler
+ >>> from mmdet.core.bbox import AssignResult
+ >>> from mmdet.core.bbox.demodata import ensure_rng, random_boxes
+ >>> rng = ensure_rng(None)
+ >>> assign_result = AssignResult.random(rng=rng)
+ >>> bboxes = random_boxes(assign_result.num_preds, rng=rng)
+ >>> gt_bboxes = random_boxes(assign_result.num_gts, rng=rng)
+ >>> gt_labels = None
+ >>> self = RandomSampler(num=32, pos_fraction=0.5, neg_pos_ub=-1,
+ >>> add_gt_as_proposals=False)
+ >>> self = self.sample(assign_result, bboxes, gt_bboxes, gt_labels)
+ """
+ if len(bboxes.shape) < 2:
+ bboxes = bboxes[None, :]
+
+ bboxes = bboxes[:, :4]
+
+ gt_flags = bboxes.new_zeros((bboxes.shape[0], ), dtype=torch.uint8)
+ if self.add_gt_as_proposals and len(gt_bboxes) > 0:
+ if gt_labels is None:
+ raise ValueError(
+ 'gt_labels must be given when add_gt_as_proposals is True')
+ bboxes = torch.cat([gt_bboxes, bboxes], dim=0)
+ assign_result.add_gt_(gt_labels)
+ gt_ones = bboxes.new_ones(gt_bboxes.shape[0], dtype=torch.uint8)
+ gt_flags = torch.cat([gt_ones, gt_flags])
+
+ num_expected_pos = int(self.num * self.pos_fraction)
+ pos_inds = self.pos_sampler._sample_pos(assign_result,
+ num_expected_pos,
+ bboxes=bboxes,
+ **kwargs)
+ # We found that sampled indices have duplicated items occasionally.
+ # (may be a bug of PyTorch)
+ pos_inds = pos_inds.unique()
+ num_sampled_pos = pos_inds.numel()
+ num_expected_neg = self.num - num_sampled_pos
+ if self.neg_pos_ub >= 0:
+ _pos = max(1, num_sampled_pos)
+ neg_upper_bound = int(self.neg_pos_ub * _pos)
+ if num_expected_neg > neg_upper_bound:
+ num_expected_neg = neg_upper_bound
+ neg_inds = self.neg_sampler._sample_neg(assign_result,
+ num_expected_neg,
+ bboxes=bboxes,
+ **kwargs)
+ neg_inds = neg_inds.unique()
+
+ sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
+ assign_result, gt_flags)
+ return sampling_result
diff --git a/detrsmpl/core/post_processing/bbox/samplers/builder.py b/detrsmpl/core/post_processing/bbox/samplers/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c56bad5c8e5502c44d70f9b6660f16e98626949
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/samplers/builder.py
@@ -0,0 +1,9 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.utils import Registry, build_from_cfg
+
+BBOX_SAMPLERS = Registry('bbox_sampler')
+
+
+def build_sampler(cfg, **default_args):
+ """Builder of box sampler."""
+ return build_from_cfg(cfg, BBOX_SAMPLERS, default_args)
diff --git a/detrsmpl/core/post_processing/bbox/samplers/pseudo_sampler.py b/detrsmpl/core/post_processing/bbox/samplers/pseudo_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..f37aa52327b319504b6b7e9a8290f1d4728181dd
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/samplers/pseudo_sampler.py
@@ -0,0 +1,41 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from .base_sampler import BaseSampler
+from .builder import BBOX_SAMPLERS
+from .sampling_result import SamplingResult
+
+
+@BBOX_SAMPLERS.register_module()
+class PseudoSampler(BaseSampler):
+ """A pseudo sampler that does not actually perform sampling."""
+ def __init__(self, **kwargs):
+ pass
+
+ def _sample_pos(self, **kwargs):
+ """Sample positive samples."""
+ raise NotImplementedError
+
+ def _sample_neg(self, **kwargs):
+ """Sample negative samples."""
+ raise NotImplementedError
+
+ def sample(self, assign_result, bboxes, gt_bboxes, *args, **kwargs):
+ """Directly returns the positive and negative indices of samples.
+
+ Args:
+ assign_result (:obj:`AssignResult`): Assigned results
+ bboxes (torch.Tensor): Bounding boxes
+ gt_bboxes (torch.Tensor): Ground truth boxes
+
+ Returns:
+ :obj:`SamplingResult`: sampler results
+ """
+ pos_inds = torch.nonzero(assign_result.gt_inds > 0,
+ as_tuple=False).squeeze(-1).unique()
+ neg_inds = torch.nonzero(assign_result.gt_inds == 0,
+ as_tuple=False).squeeze(-1).unique()
+ gt_flags = bboxes.new_zeros(bboxes.shape[0], dtype=torch.uint8)
+ sampling_result = SamplingResult(pos_inds, neg_inds, bboxes, gt_bboxes,
+ assign_result, gt_flags)
+ return sampling_result
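+
+ # Illustrative sketch: PseudoSampler simply splits the queries into
+ # positives (assigned to a gt) and negatives (background) without any
+ # resampling.
+ #
+ # sampling_result = PseudoSampler().sample(assign_result, bboxes, gt_bboxes)
+ # pos_inds, neg_inds = sampling_result.pos_inds, sampling_result.neg_inds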
diff --git a/detrsmpl/core/post_processing/bbox/samplers/sampling_result.py b/detrsmpl/core/post_processing/bbox/samplers/sampling_result.py
new file mode 100644
index 0000000000000000000000000000000000000000..d1ac5785b8df0b5335b61cc64d78c39aa46cfe25
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/samplers/sampling_result.py
@@ -0,0 +1,150 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmdet.utils import util_mixins
+
+
+class SamplingResult(util_mixins.NiceRepr):
+ """Bbox sampling result.
+
+ Example:
+ >>> # xdoctest: +IGNORE_WANT
+ >>> from mmdet.core.bbox.samplers.sampling_result import * # NOQA
+ >>> self = SamplingResult.random(rng=10)
+ >>> print(f'self = {self}')
+ self =
+ """
+ def __init__(self, pos_inds, neg_inds, bboxes, gt_bboxes, assign_result,
+ gt_flags):
+ self.pos_inds = pos_inds
+ self.neg_inds = neg_inds
+ self.pos_bboxes = bboxes[pos_inds]
+ self.neg_bboxes = bboxes[neg_inds]
+ self.pos_is_gt = gt_flags[pos_inds]
+
+ self.num_gts = gt_bboxes.shape[0]
+ self.pos_assigned_gt_inds = assign_result.gt_inds[pos_inds] - 1
+
+ if gt_bboxes.numel() == 0:
+ # hack for index error case
+ assert self.pos_assigned_gt_inds.numel() == 0
+ self.pos_gt_bboxes = torch.empty_like(gt_bboxes).view(-1, 4)
+ else:
+ if len(gt_bboxes.shape) < 2:
+ gt_bboxes = gt_bboxes.view(-1, 4)
+
+ self.pos_gt_bboxes = gt_bboxes[self.pos_assigned_gt_inds.long(), :]
+
+ if assign_result.labels is not None:
+ self.pos_gt_labels = assign_result.labels[pos_inds]
+ else:
+ self.pos_gt_labels = None
+
+ @property
+ def bboxes(self):
+ """torch.Tensor: concatenated positive and negative boxes"""
+ return torch.cat([self.pos_bboxes, self.neg_bboxes])
+
+ def to(self, device):
+ """Change the device of the data inplace.
+
+ Example:
+ >>> self = SamplingResult.random()
+ >>> print(f'self = {self.to(None)}')
+ >>> # xdoctest: +REQUIRES(--gpu)
+ >>> print(f'self = {self.to(0)}')
+ """
+ _dict = self.__dict__
+ for key, value in _dict.items():
+ if isinstance(value, torch.Tensor):
+ _dict[key] = value.to(device)
+ return self
+
+ def __nice__(self):
+ data = self.info.copy()
+ data['pos_bboxes'] = data.pop('pos_bboxes').shape
+ data['neg_bboxes'] = data.pop('neg_bboxes').shape
+ parts = [f"'{k}': {v!r}" for k, v in sorted(data.items())]
+ body = ' ' + ',\n '.join(parts)
+ return '{\n' + body + '\n}'
+
+ @property
+ def info(self):
+ """Returns a dictionary of info about the object."""
+ return {
+ 'pos_inds': self.pos_inds,
+ 'neg_inds': self.neg_inds,
+ 'pos_bboxes': self.pos_bboxes,
+ 'neg_bboxes': self.neg_bboxes,
+ 'pos_is_gt': self.pos_is_gt,
+ 'num_gts': self.num_gts,
+ 'pos_assigned_gt_inds': self.pos_assigned_gt_inds,
+ }
+
+ @classmethod
+ def random(cls, rng=None, **kwargs):
+ """
+ Args:
+ rng (None | int | numpy.random.RandomState): seed or state.
+ kwargs (keyword arguments):
+ - num_preds: number of predicted boxes
+ - num_gts: number of true boxes
+ - p_ignore (float): probability of a predicted box assigned to \
+ an ignored truth.
+ - p_assigned (float): probability of a predicted box not being \
+ assigned.
+ - p_use_label (float | bool): with labels or not.
+
+ Returns:
+ :obj:`SamplingResult`: Randomly generated sampling result.
+
+ Example:
+ >>> from mmdet.core.bbox.samplers.sampling_result import * # NOQA
+ >>> self = SamplingResult.random()
+ >>> print(self.__dict__)
+ """
+ from mmdet.core.bbox import demodata
+ from mmdet.core.bbox.assigners.assign_result import AssignResult
+ from mmdet.core.bbox.samplers.random_sampler import RandomSampler
+ rng = demodata.ensure_rng(rng)
+
+ # make probabilistic?
+ num = 32
+ pos_fraction = 0.5
+ neg_pos_ub = -1
+
+ assign_result = AssignResult.random(rng=rng, **kwargs)
+
+ # Note we could just compute an assignment
+ bboxes = demodata.random_boxes(assign_result.num_preds, rng=rng)
+ gt_bboxes = demodata.random_boxes(assign_result.num_gts, rng=rng)
+
+ if rng.rand() > 0.2:
+ # sometimes algorithms squeeze their data, be robust to that
+ gt_bboxes = gt_bboxes.squeeze()
+ bboxes = bboxes.squeeze()
+
+ if assign_result.labels is None:
+ gt_labels = None
+ else:
+ gt_labels = None # todo
+
+ if gt_labels is None:
+ add_gt_as_proposals = False
+ else:
+ add_gt_as_proposals = True # make probabilistic?
+
+ sampler = RandomSampler(num,
+ pos_fraction,
+ neg_pos_ub=neg_pos_ub,
+ add_gt_as_proposals=add_gt_as_proposals,
+ rng=rng)
+ self = sampler.sample(assign_result, bboxes, gt_bboxes, gt_labels)
+ return self
diff --git a/detrsmpl/core/post_processing/bbox/transforms.py b/detrsmpl/core/post_processing/bbox/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..6d72076a5621c5b59c081a8a190b4c8d167c26a5
--- /dev/null
+++ b/detrsmpl/core/post_processing/bbox/transforms.py
@@ -0,0 +1,270 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+import torch
+
+
+def find_inside_bboxes(bboxes, img_h, img_w):
+ """Find bboxes as long as a part of bboxes is inside the image.
+
+ Args:
+ bboxes (Tensor): Shape (N, 4).
+ img_h (int): Image height.
+ img_w (int): Image width.
+
+ Returns:
+ Tensor: Index of the remaining bboxes.
+ """
+ inside_inds = (bboxes[:, 0] < img_w) & (bboxes[:, 2] > 0) \
+ & (bboxes[:, 1] < img_h) & (bboxes[:, 3] > 0)
+ return inside_inds
+
+
+def bbox_flip(bboxes, img_shape, direction='horizontal'):
+ """Flip bboxes horizontally or vertically.
+
+ Args:
+ bboxes (Tensor): Shape (..., 4*k)
+ img_shape (tuple): Image shape.
+ direction (str): Flip direction, options are "horizontal", "vertical",
+ "diagonal". Default: "horizontal"
+
+ Returns:
+ Tensor: Flipped bboxes.
+ """
+ assert bboxes.shape[-1] % 4 == 0
+ assert direction in ['horizontal', 'vertical', 'diagonal']
+ flipped = bboxes.clone()
+ if direction == 'horizontal':
+ flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4]
+ flipped[..., 2::4] = img_shape[1] - bboxes[..., 0::4]
+ elif direction == 'vertical':
+ flipped[..., 1::4] = img_shape[0] - bboxes[..., 3::4]
+ flipped[..., 3::4] = img_shape[0] - bboxes[..., 1::4]
+ else:
+ flipped[..., 0::4] = img_shape[1] - bboxes[..., 2::4]
+ flipped[..., 1::4] = img_shape[0] - bboxes[..., 3::4]
+ flipped[..., 2::4] = img_shape[1] - bboxes[..., 0::4]
+ flipped[..., 3::4] = img_shape[0] - bboxes[..., 1::4]
+ return flipped
+
+
+def bbox_mapping(bboxes,
+ img_shape,
+ scale_factor,
+ flip,
+ flip_direction='horizontal'):
+ """Map bboxes from the original image scale to testing scale."""
+ new_bboxes = bboxes * bboxes.new_tensor(scale_factor)
+ if flip:
+ new_bboxes = bbox_flip(new_bboxes, img_shape, flip_direction)
+ return new_bboxes
+
+
+def bbox_mapping_back(bboxes,
+ img_shape,
+ scale_factor,
+ flip,
+ flip_direction='horizontal'):
+ """Map bboxes from testing scale to original image scale."""
+ new_bboxes = bbox_flip(bboxes, img_shape,
+ flip_direction) if flip else bboxes
+ new_bboxes = new_bboxes.view(-1, 4) / new_bboxes.new_tensor(scale_factor)
+ return new_bboxes.view(bboxes.shape)
+
+
+def bbox2roi(bbox_list):
+ """Convert a list of bboxes to roi format.
+
+ Args:
+ bbox_list (list[Tensor]): a list of bboxes corresponding to a batch
+ of images.
+
+ Returns:
+ Tensor: shape (n, 5), [batch_ind, x1, y1, x2, y2]
+ """
+ rois_list = []
+ for img_id, bboxes in enumerate(bbox_list):
+ if bboxes.size(0) > 0:
+ img_inds = bboxes.new_full((bboxes.size(0), 1), img_id)
+ rois = torch.cat([img_inds, bboxes[:, :4]], dim=-1)
+ else:
+ rois = bboxes.new_zeros((0, 5))
+ rois_list.append(rois)
+ rois = torch.cat(rois_list, 0)
+ return rois
+
+
+def roi2bbox(rois):
+ """Convert rois to bounding box format.
+
+ Args:
+ rois (torch.Tensor): RoIs with the shape (n, 5) where the first
+ column indicates batch id of each RoI.
+
+ Returns:
+ list[torch.Tensor]: Converted boxes of corresponding rois.
+ """
+ bbox_list = []
+ img_ids = torch.unique(rois[:, 0].cpu(), sorted=True)
+ for img_id in img_ids:
+ inds = (rois[:, 0] == img_id.item())
+ bbox = rois[inds, 1:]
+ bbox_list.append(bbox)
+ return bbox_list
+
+
+def bbox2result(bboxes, labels, num_classes):
+ """Convert detection results to a list of numpy arrays.
+
+ Args:
+ bboxes (torch.Tensor | np.ndarray): shape (n, 5)
+ labels (torch.Tensor | np.ndarray): shape (n, )
+ num_classes (int): class number, including background class
+
+ Returns:
+ list(ndarray): bbox results of each class
+ """
+ if bboxes.shape[0] == 0:
+ return [np.zeros((0, 5), dtype=np.float32) for i in range(num_classes)]
+ else:
+ if isinstance(bboxes, torch.Tensor):
+ bboxes = bboxes.detach().cpu().numpy()
+ labels = labels.detach().cpu().numpy()
+ return [bboxes[labels == i, :] for i in range(num_classes)]
+
+
+def distance2bbox(points, distance, max_shape=None):
+ """Decode distance prediction to bounding box.
+
+ Args:
+ points (Tensor): Shape (B, N, 2) or (N, 2).
+ distance (Tensor): Distance from the given point to 4
+ boundaries (left, top, right, bottom). Shape (B, N, 4) or (N, 4)
+ max_shape (Sequence[int] or torch.Tensor or Sequence[
+ Sequence[int]],optional): Maximum bounds for boxes, specifies
+ (H, W, C) or (H, W). If priors shape is (B, N, 4), then
+ the max_shape should be a Sequence[Sequence[int]]
+ and the length of max_shape should also be B.
+
+ Returns:
+ Tensor: Boxes with shape (N, 4) or (B, N, 4)
+ """
+
+ x1 = points[..., 0] - distance[..., 0]
+ y1 = points[..., 1] - distance[..., 1]
+ x2 = points[..., 0] + distance[..., 2]
+ y2 = points[..., 1] + distance[..., 3]
+
+ bboxes = torch.stack([x1, y1, x2, y2], -1)
+
+ if max_shape is not None:
+ if bboxes.dim() == 2 and not torch.onnx.is_in_onnx_export():
+ # speed up
+ bboxes[:, 0::2].clamp_(min=0, max=max_shape[1])
+ bboxes[:, 1::2].clamp_(min=0, max=max_shape[0])
+ return bboxes
+
+ # clip bboxes with dynamic `min` and `max` for onnx
+ if torch.onnx.is_in_onnx_export():
+ from mmdet.core.export import dynamic_clip_for_onnx
+ x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, max_shape)
+ bboxes = torch.stack([x1, y1, x2, y2], dim=-1)
+ return bboxes
+ if not isinstance(max_shape, torch.Tensor):
+ max_shape = x1.new_tensor(max_shape)
+ max_shape = max_shape[..., :2].type_as(x1)
+ if max_shape.ndim == 2:
+ assert bboxes.ndim == 3
+ assert max_shape.size(0) == bboxes.size(0)
+
+ min_xy = x1.new_tensor(0)
+ max_xy = torch.cat([max_shape, max_shape],
+ dim=-1).flip(-1).unsqueeze(-2)
+ bboxes = torch.where(bboxes < min_xy, min_xy, bboxes)
+ bboxes = torch.where(bboxes > max_xy, max_xy, bboxes)
+
+ return bboxes
+
+
+def bbox2distance(points, bbox, max_dis=None, eps=0.1):
+ """Encode bounding boxes as distances from points to the four box sides.
+
+ Args:
+ points (Tensor): Shape (n, 2), [x, y].
+ bbox (Tensor): Shape (n, 4), "xyxy" format
+ max_dis (float): Upper bound of the distance.
+ eps (float): a small value to ensure target < max_dis, instead <=
+
+ Returns:
+ Tensor: Distances (left, top, right, bottom) with shape (n, 4).
+ """
+ left = points[:, 0] - bbox[:, 0]
+ top = points[:, 1] - bbox[:, 1]
+ right = bbox[:, 2] - points[:, 0]
+ bottom = bbox[:, 3] - points[:, 1]
+ if max_dis is not None:
+ left = left.clamp(min=0, max=max_dis - eps)
+ top = top.clamp(min=0, max=max_dis - eps)
+ right = right.clamp(min=0, max=max_dis - eps)
+ bottom = bottom.clamp(min=0, max=max_dis - eps)
+ return torch.stack([left, top, right, bottom], -1)
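+
+ # Illustrative round-trip sketch: distance2bbox inverts bbox2distance for
+ # points lying inside their boxes (when no clamping is applied).
+ #
+ # points = torch.tensor([[5., 5.]])
+ # boxes = torch.tensor([[0., 0., 8., 8.]])
+ # distance2bbox(points, bbox2distance(points, boxes))  # -> boxes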
+
+
+def bbox_rescale(bboxes, scale_factor=1.0):
+ """Rescale bounding box w.r.t. scale_factor.
+
+ Args:
+ bboxes (Tensor): Shape (n, 4) for bboxes or (n, 5) for rois
+ scale_factor (float): rescale factor
+
+ Returns:
+ Tensor: Rescaled bboxes.
+ """
+ if bboxes.size(1) == 5:
+ bboxes_ = bboxes[:, 1:]
+ inds_ = bboxes[:, 0]
+ else:
+ bboxes_ = bboxes
+ cx = (bboxes_[:, 0] + bboxes_[:, 2]) * 0.5
+ cy = (bboxes_[:, 1] + bboxes_[:, 3]) * 0.5
+ w = bboxes_[:, 2] - bboxes_[:, 0]
+ h = bboxes_[:, 3] - bboxes_[:, 1]
+ w = w * scale_factor
+ h = h * scale_factor
+ x1 = cx - 0.5 * w
+ x2 = cx + 0.5 * w
+ y1 = cy - 0.5 * h
+ y2 = cy + 0.5 * h
+ if bboxes.size(1) == 5:
+ rescaled_bboxes = torch.stack([inds_, x1, y1, x2, y2], dim=-1)
+ else:
+ rescaled_bboxes = torch.stack([x1, y1, x2, y2], dim=-1)
+ return rescaled_bboxes
+
+
+def bbox_cxcywh_to_xyxy(bbox):
+ """Convert bbox coordinates from (cx, cy, w, h) to (x1, y1, x2, y2).
+
+ Args:
+ bbox (Tensor): Shape (n, 4) for bboxes.
+
+ Returns:
+ Tensor: Converted bboxes.
+ """
+ cx, cy, w, h = bbox.split((1, 1, 1, 1), dim=-1)
+ bbox_new = [(cx - 0.5 * w), (cy - 0.5 * h), (cx + 0.5 * w), (cy + 0.5 * h)]
+ return torch.cat(bbox_new, dim=-1)
+
+
+def bbox_xyxy_to_cxcywh(bbox):
+ """Convert bbox coordinates from (x1, y1, x2, y2) to (cx, cy, w, h).
+
+ Args:
+ bbox (Tensor): Shape (n, 4) for bboxes.
+
+ Returns:
+ Tensor: Converted bboxes.
+ """
+ x1, y1, x2, y2 = bbox.split((1, 1, 1, 1), dim=-1)
+ bbox_new = [(x1 + x2) / 2, (y1 + y2) / 2, (x2 - x1), (y2 - y1)]
+ return torch.cat(bbox_new, dim=-1)
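+
+ # Illustrative round-trip sketch: the two conversions are inverses.
+ #
+ # b = torch.tensor([[10., 10., 20., 30.]])     # (x1, y1, x2, y2)
+ # bbox_cxcywh_to_xyxy(bbox_xyxy_to_cxcywh(b))  # -> b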
diff --git a/detrsmpl/core/post_processing/builder.py b/detrsmpl/core/post_processing/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae2acf4afdf20891a259d14e86a674f695a61c89
--- /dev/null
+++ b/detrsmpl/core/post_processing/builder.py
@@ -0,0 +1,8 @@
+from mmcv.utils import Registry
+
+POST_PROCESSING = Registry('post_processing')
+
+
+def build_post_processing(cfg):
+ """Build post processing function."""
+ return POST_PROCESSING.build(cfg)
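+
+ # Usage sketch (illustrative): the smoothing/speed-up modules registered
+ # with this registry are built from config dicts, e.g.
+ #
+ # smoother = build_post_processing(
+ #     dict(type='savgol', window_size=11, polyorder=2))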
diff --git a/detrsmpl/core/post_processing/smooth/__init__.py b/detrsmpl/core/post_processing/smooth/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/core/post_processing/smooth/gaus1d_filter.py b/detrsmpl/core/post_processing/smooth/gaus1d_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..403f79a39d4bfe8193e5c26b60a116acc4b7aaf3
--- /dev/null
+++ b/detrsmpl/core/post_processing/smooth/gaus1d_filter.py
@@ -0,0 +1,60 @@
+import warnings
+
+import numpy as np
+import scipy.signal as signal
+import torch
+from scipy.ndimage.filters import gaussian_filter1d
+
+from ..builder import POST_PROCESSING
+
+
+@POST_PROCESSING.register_module(name=['Gaus1dFilter', 'gaus1d'])
+class Gaus1dFilter:
+ """Applies a median filter and then a Gaussian filter. Code adapted from:
+ https://github.com/akanazawa/human_dynamics/blob/master/src/util/smooth_bbox.py.
+
+ Args:
+ x (np.ndarray): input pose
+ window_size (int, optional): for median filters (must be odd).
+ sigma (float, optional): Sigma for gaussian smoothing.
+
+ Returns:
+ np.ndarray: Smoothed poses
+ """
+ def __init__(self, window_size=11, sigma=4):
+ super(Gaus1dFilter, self).__init__()
+
+ self.window_size = window_size
+ self.sigma = sigma
+
+ def __call__(self, x=None):
+ if self.window_size % 2 == 0:
+ window_size = self.window_size - 1
+ else:
+ window_size = self.window_size
+ if window_size > x.shape[0]:
+ window_size = x.shape[0]
+ if len(x.shape) != 3:
+ warnings.warn('x should be a tensor or numpy of [T*M,K,C]')
+ assert len(x.shape) == 3
+ x_type = x
+ if isinstance(x, torch.Tensor):
+ if x.is_cuda:
+ x = x.cpu().numpy()
+ else:
+ x = x.numpy()
+
+ smoothed = np.array(
+ [signal.medfilt(param, window_size) for param in x.T]).T
+ smooth_poses = np.array(
+ [gaussian_filter1d(traj, self.sigma) for traj in smoothed.T]).T
+
+ if isinstance(x_type, torch.Tensor):
+ # we also return tensor by default
+ if x_type.is_cuda:
+ smooth_poses = torch.from_numpy(smooth_poses).cuda()
+ else:
+ smooth_poses = torch.from_numpy(smooth_poses)
+
+ return smooth_poses
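+
+ # Illustrative usage sketch; the input is expected to have shape
+ # [T*M, K, C] (frames x keypoints x coordinates), values are arbitrary.
+ #
+ # import numpy as np
+ # poses = np.random.rand(100, 54, 3)
+ # smoothed = Gaus1dFilter(window_size=11, sigma=4)(poses)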
diff --git a/detrsmpl/core/post_processing/smooth/oneeuro_filter.py b/detrsmpl/core/post_processing/smooth/oneeuro_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ff5ea5dfde38273414c22e5ebd61739a0f3ae21
--- /dev/null
+++ b/detrsmpl/core/post_processing/smooth/oneeuro_filter.py
@@ -0,0 +1,114 @@
+import math
+import warnings
+
+import numpy as np
+import torch
+
+from ..builder import POST_PROCESSING
+
+
+def smoothing_factor(t_e, cutoff):
+ r = 2 * math.pi * cutoff * t_e
+ return r / (r + 1)
+
+
+def exponential_smoothing(a, x, x_prev):
+ return a * x + (1 - a) * x_prev
+
+
+class OneEuro:
+ def __init__(self,
+ t0,
+ x0,
+ dx0=0.0,
+ min_cutoff=1.0,
+ beta=0.0,
+ d_cutoff=1.0):
+ """Initialize the one euro filter."""
+ super(OneEuro, self).__init__()
+ # The parameters.
+ self.min_cutoff = float(min_cutoff)
+ self.beta = float(beta)
+ self.d_cutoff = float(d_cutoff)
+ # Previous values.
+ self.x_prev = x0
+ self.dx_prev = dx0
+ self.t_prev = t0
+
+ def __call__(self, t, x):
+ """Compute the filtered signal."""
+ t_e = t - self.t_prev
+
+ # The filtered derivative of the signal.
+ a_d = smoothing_factor(t_e, self.d_cutoff) # [k, c]
+ dx = (x - self.x_prev) / t_e
+ dx_hat = exponential_smoothing(a_d, dx, self.dx_prev)
+
+ # The filtered signal.
+ cutoff = self.min_cutoff + self.beta * np.abs(dx_hat)
+ a = smoothing_factor(t_e, cutoff)
+ x_hat = exponential_smoothing(a, x, self.x_prev)
+ # Memorize the previous values.
+ self.x_prev = x_hat
+ self.dx_prev = dx_hat
+ self.t_prev = t
+ return x_hat
+
+
+@POST_PROCESSING.register_module(name=['OneEuroFilter', 'oneeuro'])
+class OneEuroFilter:
+ """One-Euro filter; source code:
+ https://github.com/mkocabas/VIBE/blob/c0c3f77d587351c806e901221a9dc05d1ffade4b/lib/utils/smooth_pose.py.
+
+ Args:
+ min_cutoff (float, optional):
+ Decreasing the minimum cutoff frequency decreases slow speed jitter
+ beta (float, optional):
+ Increasing the speed coefficient(beta) decreases speed lag.
+
+ Returns:
+ np.ndarray: smoothed poses
+ """
+ def __init__(self, min_cutoff=0.004, beta=0.7):
+ super(OneEuroFilter, self).__init__()
+
+ self.min_cutoff = min_cutoff
+ self.beta = beta
+
+ def __call__(self, x=None):
+ # x (np.ndarray): input poses.
+ if len(x.shape) != 3:
+ warnings.warn('x should be a tensor or numpy of [T*M,K,C]')
+ assert len(x.shape) == 3
+ x_type = x
+ if isinstance(x, torch.Tensor):
+ if x.is_cuda:
+ x = x.cpu().numpy()
+ else:
+ x = x.numpy()
+
+ one_euro_filter = OneEuro(
+ np.zeros_like(x[0]),
+ x[0],
+ min_cutoff=self.min_cutoff,
+ beta=self.beta,
+ )
+
+ pred_pose_hat = np.zeros_like(x)
+
+ # initialize
+ pred_pose_hat[0] = x[0]
+
+ for idx, pose in enumerate(x[1:]):
+ idx += 1
+ t = np.ones_like(pose) * idx
+ pose = one_euro_filter(t, pose)
+ pred_pose_hat[idx] = pose
+
+ if isinstance(x_type, torch.Tensor):
+ # we also return tensor by default
+ if x_type.is_cuda:
+ pred_pose_hat = torch.from_numpy(pred_pose_hat).cuda()
+ else:
+ pred_pose_hat = torch.from_numpy(pred_pose_hat)
+ return pred_pose_hat
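+
+ # Illustrative usage sketch; input shape [T*M, K, C], values arbitrary.
+ #
+ # import numpy as np
+ # poses = np.random.rand(100, 54, 3)
+ # smoothed = OneEuroFilter(min_cutoff=0.004, beta=0.7)(poses)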
diff --git a/detrsmpl/core/post_processing/smooth/savgol_filter.py b/detrsmpl/core/post_processing/smooth/savgol_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..a481833557f92c3b19af26ee5c75f091fab89540
--- /dev/null
+++ b/detrsmpl/core/post_processing/smooth/savgol_filter.py
@@ -0,0 +1,73 @@
+import warnings
+
+import numpy as np
+import scipy.signal as signal
+import torch
+
+from ..builder import POST_PROCESSING
+
+
+@POST_PROCESSING.register_module(name=['SGFilter', 'savgol'])
+class SGFilter:
+ """Savitzky-Golay filter; implementation from scipy:
+ https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.savgol_filter.html.
+
+ Args:
+ window_size (float):
+ The length of the filter window
+ (i.e., the number of coefficients).
+ window_length must be a positive odd integer.
+ polyorder (int):
+ The order of the polynomial used to fit the samples.
+ polyorder must be less than window_length.
+
+ Returns:
+ smoothed poses (np.ndarray, torch.tensor)
+ """
+ def __init__(self, window_size=11, polyorder=2):
+ super(SGFilter, self).__init__()
+
+ # 1-D Savitzky-Golay filter
+ self.window_size = window_size
+ self.polyorder = polyorder
+
+ def __call__(self, x=None):
+ # x.shape: [t,k,c]
+ if self.window_size % 2 == 0:
+ window_size = self.window_size - 1
+ else:
+ window_size = self.window_size
+ if window_size > x.shape[0]:
+ window_size = x.shape[0]
+ if window_size <= self.polyorder:
+ polyorder = window_size - 1
+ else:
+ polyorder = self.polyorder
+ assert polyorder > 0
+ assert window_size > polyorder
+ if len(x.shape) != 3:
+ warnings.warn('x should be a tensor or numpy of [T*M,K,C]')
+ assert len(x.shape) == 3
+ x_type = x
+ if isinstance(x, torch.Tensor):
+ if x.is_cuda:
+ x = x.cpu().numpy()
+ else:
+ x = x.numpy()
+ smooth_poses = np.zeros_like(x)
+ # smooth at different axis
+ C = x.shape[-1]
+ for i in range(C):
+ smooth_poses[..., i] = signal.savgol_filter(x[..., i],
+ window_size,
+ polyorder,
+ axis=0)
+
+ if isinstance(x_type, torch.Tensor):
+ # we also return tensor by default
+ if x_type.is_cuda:
+ smooth_poses = torch.from_numpy(smooth_poses).cuda()
+ else:
+ smooth_poses = torch.from_numpy(smooth_poses)
+ return smooth_poses
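+
+ # Illustrative usage sketch; input shape [T*M, K, C], values arbitrary.
+ #
+ # import numpy as np
+ # poses = np.random.rand(100, 54, 3)
+ # smoothed = SGFilter(window_size=11, polyorder=2)(poses)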
diff --git a/detrsmpl/core/post_processing/smooth/smoothnet.py b/detrsmpl/core/post_processing/smooth/smoothnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..1412b72cb6119a07894939754a6343e8332f714d
--- /dev/null
+++ b/detrsmpl/core/post_processing/smooth/smoothnet.py
@@ -0,0 +1,237 @@
+from typing import Optional
+
+import numpy as np
+import torch
+from mmcv.runner import load_checkpoint
+from torch import Tensor, nn
+
+from detrsmpl.utils.transforms import (
+ aa_to_rotmat,
+ rot6d_to_rotmat,
+ rotmat_to_aa,
+ rotmat_to_rot6d,
+)
+from ..builder import POST_PROCESSING
+
+
+class SmoothNetResBlock(nn.Module):
+ """Residual block module used in SmoothNet.
+
+ Args:
+ in_channels (int): Input channel number.
+ hidden_channels (int): The hidden feature channel number.
+ dropout (float): Dropout probability. Default: 0.1
+ Shape:
+ Input: (*, in_channels)
+ Output: (*, in_channels)
+ """
+ def __init__(self, in_channels, hidden_channels, dropout=0.1):
+ super().__init__()
+ self.linear1 = nn.Linear(in_channels, hidden_channels)
+ self.linear2 = nn.Linear(hidden_channels, in_channels)
+ self.lrelu = nn.LeakyReLU(0.2, inplace=True)
+ self.dropout = nn.Dropout(p=dropout, inplace=True)
+
+ def forward(self, x):
+ identity = x
+ x = self.linear1(x)
+ x = self.dropout(x)
+ x = self.lrelu(x)
+ x = self.linear2(x)
+ x = self.dropout(x)
+ x = self.lrelu(x)
+
+ out = x + identity
+ return out
+
+
+class SmoothNet(nn.Module):
+ """SmoothNet is a plug-and-play temporal-only network to refine human
+ poses. It works for 2d/3d/6d pose smoothing.
+ "SmoothNet: A Plug-and-Play Network for Refining Human Poses in Videos",
+ arXiv'2021. More details can be found in the `paper
+ `__ .
+ Note:
+ N: The batch size
+ T: The temporal length of the pose sequence
+ C: The total pose dimension (e.g. keypoint_number * keypoint_dim)
+ Args:
+ window_size (int): The size of the input window.
+ output_size (int): The size of the output window.
+ hidden_size (int): The hidden feature dimension in the encoder,
+ the decoder and between residual blocks. Default: 512
+ res_hidden_size (int): The hidden feature dimension inside the
+ residual blocks. Default: 512
+ num_blocks (int): The number of residual blocks. Default: 5
+ dropout (float): Dropout probability. Default: 0.1
+ Shape:
+ Input: (N, C, T) the original pose sequence
+ Output: (N, C, T) the smoothed pose sequence
+ """
+ def __init__(self,
+ window_size: int,
+ output_size: int,
+ hidden_size: int = 512,
+ res_hidden_size: int = 512,
+ num_blocks: int = 5,
+ dropout: float = 0.1):
+ super().__init__()
+ self.window_size = window_size
+ self.output_size = output_size
+ self.hidden_size = hidden_size
+ self.res_hidden_size = res_hidden_size
+ self.num_blocks = num_blocks
+ self.dropout = dropout
+
+ assert output_size <= window_size, (
+ 'The output size should be less than or equal to the window size.',
+ f' Got output_size=={output_size} and window_size=={window_size}')
+
+ # Build encoder layers
+ self.encoder = nn.Sequential(nn.Linear(window_size, hidden_size),
+ nn.LeakyReLU(0.1, inplace=True))
+
+ # Build residual blocks
+ res_blocks = []
+ for _ in range(num_blocks):
+ res_blocks.append(
+ SmoothNetResBlock(in_channels=hidden_size,
+ hidden_channels=res_hidden_size,
+ dropout=dropout))
+ self.res_blocks = nn.Sequential(*res_blocks)
+
+ # Build decoder layers
+ self.decoder = nn.Linear(hidden_size, output_size)
+
+ def forward(self, x: Tensor) -> Tensor:
+ """Forward function."""
+ N, C, T = x.shape
+ num_windows = T - self.window_size + 1
+
+ assert T >= self.window_size, (
+ 'Input sequence length must be no less than the window size. ',
+ f'Got x.shape[2]=={T} and window_size=={self.window_size}')
+
+ # Unfold x to obtain input sliding windows
+ # [N, C, num_windows, window_size]
+ x = x.unfold(2, self.window_size, 1)
+
+ # Forward layers
+ x = self.encoder(x)
+ x = self.res_blocks(x)
+ x = self.decoder(x) # [N, C, num_windows, output_size]
+
+ # Accumulate output ensembles
+ out = x.new_zeros(N, C, T)
+ count = x.new_zeros(T)
+
+ for t in range(num_windows):
+ out[..., t:t + self.output_size] += x[:, :, t]
+ count[t:t + self.output_size] += 1.0
+
+ return out.div(count)
+
+
+@POST_PROCESSING.register_module(name=['SmoothNetFilter', 'smoothnet'])
+class SmoothNetFilter:
+ """Apply SmoothNet filter.
+ "SmoothNet: A Plug-and-Play Network for Refining Human Poses in Videos",
+ arXiv'2021. More details can be found in the `paper
+ `__ .
+ Args:
+ window_size (int): The size of the filter window. It's also the
+ window_size of SmoothNet model.
+ output_size (int): The output window size of SmoothNet model.
+ checkpoint (str): The checkpoint file of the pretrained SmoothNet
+ model. Please note that `checkpoint` should be matched with
+ `window_size` and `output_size`.
+ hidden_size (int): SmoothNet argument. See :class:`SmoothNet` for
+ details. Default: 512
+ res_hidden_size (int): SmoothNet argument. See :class:`SmoothNet`
+ for details. Default: 512
+ num_blocks (int): SmoothNet argument. See :class:`SmoothNet` for
+ details. Default: 5
+ device (str): Device for model inference. Default: 'cpu'
+ """
+ def __init__(
+ self,
+ window_size: int,
+ output_size: int,
+ checkpoint: Optional[str] = None,
+ hidden_size: int = 512,
+ res_hidden_size: int = 512,
+ num_blocks: int = 5,
+ device: str = 'cpu',
+ ):
+ super(SmoothNetFilter, self).__init__()
+ self.window_size = window_size
+ self.device = device
+ self.smoothnet = SmoothNet(window_size, output_size, hidden_size,
+ res_hidden_size, num_blocks)
+ self.smoothnet.to(device)
+ if checkpoint:
+ load_checkpoint(self.smoothnet,
+ checkpoint,
+ map_location=self.device)
+ self.smoothnet.eval()
+
+ for p in self.smoothnet.parameters():
+ p.requires_grad_(False)
+
+ def __call__(self, x: np.ndarray):
+ x_type = 'tensor'
+ if not isinstance(x, torch.Tensor):
+ x_type = 'array'
+
+ assert x.ndim == 3, ('Input should be an array with shape [T, K, C]'
+ f', but got invalid shape {x.shape}')
+
+ T, K, C = x.shape
+
+ assert C == 3 or C == 6 or C == 9
+
+ if T < self.window_size:
+ # Skip smoothing if the input length is less than the window size
+ smoothed = x
+ else:
+ if x_type == 'array':
+ dtype = x.dtype
+
+ # Convert to tensor and forward the model
+ with torch.no_grad():
+ if x_type == 'array':
+ x = torch.tensor(x,
+ dtype=torch.float32,
+ device=self.device)
+ if C == 9:
+ input_type = 'matrix'
+ x = rotmat_to_rot6d(x.reshape(-1, 3, 3)).reshape(T, K, -1)
+ elif C == 3:
+ input_type = 'axis_angles'
+ x = rotmat_to_rot6d(aa_to_rotmat(x.reshape(-1,
+ 3))).reshape(
+ T, K, -1)
+ else:
+ input_type = 'rotation_6d'
+ x = x.view(1, T, -1).permute(0, 2, 1) # to [1, KC, T]
+ smoothed = self.smoothnet(x) # in shape [1, KC, T]
+
+ # Convert model output back to input shape and format
+ smoothed = smoothed.permute(0, 2, 1).view(T, K, -1) # to [T, K, C]
+
+ if input_type == 'matrix':
+ smoothed = rot6d_to_rotmat(smoothed.reshape(-1, 6)).reshape(
+ T, K, C)
+ elif input_type == 'axis_angles':
+ smoothed = rotmat_to_aa(
+ rot6d_to_rotmat(smoothed.reshape(-1, 6))).reshape(T, K, C)
+
+ if x_type == 'array':
+ smoothed = smoothed.cpu().numpy().astype(
+ dtype) # to numpy.ndarray
+
+ return smoothed
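+
+
+# A minimal usage sketch (assuming a SmoothNet checkpoint trained with a
+# matching window_size/output_size is available; the path below is only a
+# placeholder):
+#
+#     smooth_filter = SmoothNetFilter(
+#         window_size=8,
+#         output_size=8,
+#         checkpoint='pretrained_models/smoothnet.pth',
+#         device='cpu')
+#     poses = np.random.rand(100, 24, 3)   # [T, K, C] axis-angle per frame
+#     smoothed = smooth_filter(poses)      # same shape and dtype as the input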
diff --git a/detrsmpl/core/post_processing/speed_up/__init__.py b/detrsmpl/core/post_processing/speed_up/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/core/post_processing/speed_up/deciwatch.py b/detrsmpl/core/post_processing/speed_up/deciwatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..631bf4c0f88eb57d0af8c819c2923c23614dc59b
--- /dev/null
+++ b/detrsmpl/core/post_processing/speed_up/deciwatch.py
@@ -0,0 +1,716 @@
+import copy
+import math
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from mmcv.runner import load_checkpoint
+from torch import Tensor, nn
+
+from detrsmpl.utils.transforms import (
+ aa_to_rotmat,
+ rot6d_to_rotmat,
+ rotmat_to_aa,
+ rotmat_to_rot6d,
+)
+from ..builder import POST_PROCESSING
+
+
+@POST_PROCESSING.register_module(name=['DeciWatchPostProcessing', 'deciwatch'])
+class DeciWatchPostProcessing:
+    """Apply the DeciWatch post-processing filter: https://arxiv.org/abs/2203.08713.
+
+    Args:
+        interval (int): The sampling interval between visible frames.
+        slide_window_q (int): The number of intervals per sliding window;
+            the window size is ``slide_window_q * interval + 1`` frames.
+        checkpoint (str): Path to the pretrained DeciWatch checkpoint.
+        device (Union[torch.device, str], optional):
+            The device (cpu or gpu) used for model inference.
+            Defaults to None.
+
+    Returns:
+        torch.Tensor: The smoothed poses, in the same rotation format
+            as the input.
+    """
+
+ def __init__(self, interval, slide_window_q, checkpoint, device=None):
+ super(DeciWatchPostProcessing, self).__init__()
+ self.interval = interval
+ self.slide_window_q = slide_window_q
+ self.slide_window_size = self.slide_window_q * self.interval + 1
+ self.device = device
+
+ self.input_dimension = 24 * 6
+
+ self.model = DeciWatch(sample_interval=self.interval).to(self.device)
+
+ self.checkpoint_path = checkpoint
+
+ print(f'load checkpoint from local path: {self.checkpoint_path}')
+ load_checkpoint(
+ self.model, self.checkpoint_path, map_location=self.device)
+
+ def __call__(self, x=None):
+        # x.shape: [T, 24, 3], [T, 24, 3, 3] or [T, 24 * 6]
+ seq_len = x.shape[0]
+ assert seq_len > self.slide_window_size
+        assert x.shape[1:] == (24, 3, 3) or x.shape[1:] == (
+            self.input_dimension, ) or x.shape[1:] == (24, 3)
+
+ if x.shape[1:] == (24, 3, 3):
+ input_type = 'matrix'
+ x = torch.tensor(x).to(self.device)
+ x = rotmat_to_rot6d(x).reshape(-1, self.input_dimension)
+ elif x.shape[1:] == (24, 3):
+ input_type = 'axis_angles'
+ x = torch.tensor(x).to(self.device)
+ x = rotmat_to_rot6d(aa_to_rotmat(x.reshape(-1, 3))).reshape(
+ -1, self.input_dimension)
+ else:
+ x = torch.tensor(x).to(self.device)
+ x = x.reshape(-1, self.input_dimension)
+ input_type = 'rotation_6d'
+
+ input = x.clone()
+
+ slide_window_x = torch.as_strided(
+ input, ((seq_len - self.slide_window_size) // (self.interval) + 1,
+ self.slide_window_size, self.input_dimension),
+ (self.interval * self.input_dimension, self.input_dimension, 1),
+ storage_offset=0).reshape(-1, self.slide_window_size,
+ self.input_dimension)
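+        # as_strided builds overlapping windows without copying: window i
+        # covers frames [i * interval, i * interval + slide_window_size), so
+        # consecutive windows overlap by slide_window_size - interval frames.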
+
+ smoothed_len = (
+ seq_len - self.slide_window_size
+ ) // self.interval * self.interval + self.slide_window_size
+
+ with torch.no_grad():
+ smooth_poses, _ = self.model(slide_window_x, self.device)
+
+ output_poses = [[] for i in range(smoothed_len)]
+
+ for i in range(smooth_poses.shape[0]):
+ for j in range(self.slide_window_size):
+ output_poses[i * self.interval + j].append(smooth_poses[i,
+ j, :])
+
+ smooth_poses = torch.cat(
+ (smooth_poses[:, :self.slide_window_size - 1, :].reshape(
+ -1, self.input_dimension), smooth_poses[-1, -1, :].reshape(
+ -1, self.input_dimension)),
+ dim=0)
+
+ for i in range(smoothed_len):
+ output_poses[i] = torch.stack(output_poses[i]).mean(0)
+
+ output_poses = torch.stack(output_poses)
+
+ if smoothed_len < seq_len:
+ output_poses = torch.cat((output_poses, x[smoothed_len:, :]),
+ dim=0)
+
+ if input_type == 'matrix':
+ output_poses = rot6d_to_rotmat(output_poses.reshape(
+ -1, 6)).reshape(-1, 24, 3, 3)
+ elif input_type == 'axis_angles':
+ output_poses = rotmat_to_aa(
+ rot6d_to_rotmat(output_poses.reshape(-1,
+ 6))).reshape(-1, 24, 3)
+
+ return output_poses
+
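+# A minimal usage sketch (assuming a DeciWatch checkpoint trained with the
+# same interval/slide_window_q is available; the path below is only a
+# placeholder):
+#
+#     speed_up = DeciWatchPostProcessing(
+#         interval=10,
+#         slide_window_q=2,   # window size = 2 * 10 + 1 = 21 frames
+#         checkpoint='pretrained_models/deciwatch.pth',
+#         device='cpu')
+#     poses = np.random.rand(120, 24, 3)   # [T, 24, 3] axis-angle
+#     smoothed = speed_up(poses)           # (T, 24, 3) tensor of smoothed poses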
+
+class PositionEmbeddingSine_1D(nn.Module):
+    """Standard sinusoidal position embedding, very similar to the one used
+    in the "Attention Is All You Need" paper, adapted here to 1D temporal
+    sequences."""
+
+ def __init__(self,
+ num_pos_feats=64,
+ temperature=10000,
+ normalize=True,
+ scale=None):
+ super().__init__()
+ self.num_pos_feats = num_pos_feats
+ self.temperature = temperature
+ self.normalize = normalize
+ if scale is not None and normalize is False:
+ raise ValueError('normalize should be True if scale is passed')
+ if scale is None:
+ scale = 2 * math.pi
+ self.scale = scale
+
+ def forward(self, B, L):
+
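+        # Returns a sinusoidal embedding of shape [L, B, 2 * num_pos_feats]:
+        # frame indices are (optionally) normalized to [0, scale] and passed
+        # through sin/cos at frequencies controlled by `temperature`.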
+ position = torch.arange(0, L, dtype=torch.float32).unsqueeze(0)
+ position = position.repeat(B, 1)
+
+ if self.normalize:
+ eps = 1e-6
+ position = position / (position[:, -1:] + eps) * self.scale
+
+ dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32)
+ dim_t = self.temperature**(2 * (torch.div(dim_t, 1)) /
+ self.num_pos_feats)
+
+ pe = torch.zeros(B, L, self.num_pos_feats * 2)
+ pe[:, :, 0::2] = torch.sin(position[:, :, None] / dim_t)
+ pe[:, :, 1::2] = torch.cos(position[:, :, None] / dim_t)
+
+ pe = pe.permute(1, 0, 2)
+
+ return pe
+
+
+class DeciWatch(nn.Module):
+ """Apply DeciWatch framework for 10x efficiency.
+ "DeciWatch: A Simple Baseline for 10× Efficient 2D and 3D Pose Estimation",
+    arXiv'2022. More details can be found in the `paper
+    <https://arxiv.org/abs/2203.08713>`__ .
+ Args:
+ input_dim (int): The size of input spatial dimension,
+ e.g., 15*2 for 2d pose on the jhmdb dataset
+ sample_interval (int): DeciWatch argument. See :class:`DeciWatch`
+ for details. The intervals of the uniform sampling.
+ The sampling ratio is: 1/sample_interval. Default: 10
+        encoder_hidden_dim (int): Hidden dimension in the encoder.
+            Default: 16
+        decoder_hidden_dim (int): Hidden dimension in the decoder.
+            Default: 16
+        dropout (float): Dropout probability. Default: 0.1
+        nheads (int): Number of attention heads. Default: 4
+        dim_feedforward (int): Dimension of the feed-forward layers.
+            Default: 256
+        enc_layers (int): Number of encoder layers. Default: 3
+        dec_layers (int): Number of decoder layers. Default: 3
+ activation (str): DeciWatch argument. See :class:`DeciWatch`
+ for details. Activation function in deciwatch.
+ Default: 'leaky_relu'
+ pre_norm (bool): DeciWatch argument. See :class:`DeciWatch`
+ for details. Whether to normalize before positional embedding.
+ Default: False
+ """
+
+ def __init__(self,
+ input_dim=24 * 6,
+ sample_interval=10,
+ encoder_hidden_dim=16,
+ decoder_hidden_dim=16,
+ dropout=0.1,
+ nheads=4,
+ dim_feedforward=256,
+ enc_layers=3,
+ dec_layers=3,
+ activation='leaky_relu',
+ pre_norm=False):
+ super(DeciWatch, self).__init__()
+ self.pos_embed_dim = encoder_hidden_dim
+ self.pos_embed = self.build_position_encoding(self.pos_embed_dim)
+
+ self.sample_interval = sample_interval
+
+ self.deciwatch_par = {
+ 'input_dim': input_dim,
+ 'encoder_hidden_dim': encoder_hidden_dim,
+ 'decoder_hidden_dim': decoder_hidden_dim,
+ 'dropout': dropout,
+ 'nheads': nheads,
+ 'dim_feedforward': dim_feedforward,
+ 'enc_layers': enc_layers,
+ 'dec_layers': dec_layers,
+ 'activation': activation,
+ 'pre_norm': pre_norm
+ }
+
+ self.transformer = build_model(self.deciwatch_par)
+
+ def build_position_encoding(self, pos_embed_dim):
+ N_steps = pos_embed_dim // 2
+ position_embedding = PositionEmbeddingSine_1D(N_steps, normalize=True)
+ return position_embedding
+
+ def generate_unifrom_mask(self, L, sample_interval=10):
+        # mask values: 1 = unseen (to be recovered), 0 = seen (sampled frame)
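+        # e.g. with L == 9 and sample_interval == 4 the encoder mask is
+        # [0, 1, 1, 1, 0, 1, 1, 1, 0]: only frames 0, 4 and 8 are visible,
+        # while the decoder mask keeps every frame of the window visible.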
+
+ seq_len = L
+ if (seq_len - 1) % sample_interval != 0:
+ raise Exception(
+ 'The following equation should be satisfied: [Window size] \
+ = [sample interval] * Q + 1, where Q is an integer.')
+
+ sample_mask = np.ones(seq_len, dtype=np.int32)
+ sample_mask[::sample_interval] = 0
+
+ encoder_mask = sample_mask
+ decoder_mask = np.array([0] * L, dtype=np.int32)
+
+ return torch.tensor(encoder_mask), torch.tensor(decoder_mask)
+
+ def seqence_interpolation(self, motion, rate):
+
+ seq_len = motion.shape[-1]
+ indice = torch.arange(seq_len, dtype=int)
+ chunk = torch.div(indice, rate).type(torch.long)
+ remain = indice % rate
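+        # chunk * rate is the index of the last visible frame at or before
+        # each position and remain is the offset past it, so e.g. with
+        # rate == 4, frame 6 is blended from frames 4 and 8 with equal weight.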
+
+ prev = motion[:, :, chunk * rate]
+
+ next = torch.cat([
+ motion[:, :, (chunk[:-1] + 1) * rate], motion[:, :, -1, np.newaxis]
+ ], -1)
+ remain = remain.to(motion.device)
+
+ interpolate = (prev / rate * (rate - remain)) + (next / rate * remain)
+
+ return interpolate
+
+ def forward(self, sequence, device):
+ B, L, C = sequence.shape
+ seq = sequence.permute(0, 2, 1) # B,C,L
+
+ encoder_mask, decoder_mask = self.generate_unifrom_mask(
+ L, sample_interval=self.sample_interval)
+ encoder_mask = encoder_mask.to(seq.device)
+ decoder_mask = decoder_mask.to(seq.device)
+
+ self.input_seq = seq * (1 - encoder_mask.int())
+ self.input_seq_interp = self.seqence_interpolation(
+ self.input_seq, self.sample_interval)
+ # self.input_seq=self.input_seq.reshape(1,1,-1)
+ self.encoder_mask = encoder_mask.unsqueeze(0).repeat(B, 1).to(device)
+ self.decoder_mask = decoder_mask.unsqueeze(0).repeat(B, 1).to(device)
+
+ self.encoder_pos_embed = self.pos_embed(B, L).to(device)
+ self.decoder_pos_embed = self.encoder_pos_embed.clone().to(device)
+
+ self.recover, self.denoise = self.transformer.forward(
+ input_seq=self.input_seq.to(torch.float32),
+ encoder_mask=self.encoder_mask,
+ encoder_pos_embed=self.encoder_pos_embed,
+ input_seq_interp=self.input_seq_interp,
+ decoder_mask=self.decoder_mask,
+ decoder_pos_embed=self.decoder_pos_embed,
+ sample_interval=self.sample_interval,
+ device=device)
+
+ self.recover = self.recover.permute(1, 0, 2).reshape(B, L, C)
+ self.denoise = self.denoise.permute(1, 0, 2).reshape(B, L, C)
+
+ return self.recover, self.denoise
+
+
+class DeciWatchTransformer(nn.Module):
+
+ def __init__(self,
+ input_nc,
+ encoder_hidden_dim=512,
+ decoder_hidden_dim=512,
+ nhead=8,
+ num_encoder_layers=6,
+ num_decoder_layers=6,
+ dim_feedforward=2048,
+ dropout=0.1,
+ activation='relu',
+ pre_norm=False):
+ super(DeciWatchTransformer, self).__init__()
+
+ self.joints_dim = input_nc
+ # bring in semantic (5 frames) temporal information into tokens
+ self.decoder_embed = nn.Conv1d(
+ self.joints_dim,
+ decoder_hidden_dim,
+ kernel_size=5,
+ stride=1,
+ padding=2)
+
+ self.encoder_embed = nn.Linear(self.joints_dim, encoder_hidden_dim)
+
+ encoder_layer = DeciWatchTransformerEncoderLayer(
+ encoder_hidden_dim, nhead, dim_feedforward, dropout, activation,
+ pre_norm)
+ encoder_norm = nn.LayerNorm(encoder_hidden_dim) if pre_norm else None
+ self.encoder = DeciWatchTransformerEncoder(encoder_layer,
+ num_encoder_layers,
+ encoder_norm)
+
+ decoder_layer = DeciWatchTransformerDecoderLayer(
+ decoder_hidden_dim, nhead, dim_feedforward, dropout, activation,
+ pre_norm)
+ decoder_norm = nn.LayerNorm(decoder_hidden_dim)
+ self.decoder = DeciWatchTransformerDecoder(decoder_layer,
+ num_decoder_layers,
+ decoder_norm)
+
+ self.decoder_joints_embed = nn.Linear(decoder_hidden_dim,
+ self.joints_dim)
+ self.encoder_joints_embed = nn.Linear(encoder_hidden_dim,
+ self.joints_dim)
+
+ # reset parameters
+ for p in self.parameters():
+ if p.dim() > 1:
+ nn.init.xavier_uniform_(p)
+
+ self.encoder_hidden_dim = encoder_hidden_dim
+ self.decoder_hidden_dim = decoder_hidden_dim
+
+ self.nhead = nhead
+
+ def _generate_square_subsequent_mask(self, sz):
+ mask = torch.triu(torch.ones(sz, sz), 1)
+ mask = mask.masked_fill(mask == 1, float('-inf'))
+ return mask
+
+ def interpolate_embedding(self, input, rate):
+
+ tmp = input.clone()
+ seq_len = input.shape[0]
+ indice = torch.arange(seq_len, dtype=int).to(self.device)
+ chunk = torch.div(indice, rate).type(torch.long)
+ remain = indice % rate
+
+ prev = tmp[chunk * rate]
+
+ next = torch.cat([tmp[(chunk[:-1] + 1) * rate], tmp[-1].unsqueeze(0)],
+ dim=0)
+
+ interpolate = (prev / rate * (rate - remain.view(-1, 1, 1))) + (
+ next / rate * remain.view(-1, 1, 1))
+
+ return interpolate
+
+ def forward(self, input_seq, encoder_mask, encoder_pos_embed,
+ input_seq_interp, decoder_mask, decoder_pos_embed,
+ sample_interval, device):
+
+ self.device = device
+
+ # flatten NxCxL to LxNxC
+ bs, c, _ = input_seq.shape
+ input_seq = input_seq.permute(2, 0, 1)
+ input_seq_interp = input_seq_interp.permute(2, 0, 1)
+
+ input = input_seq.clone()
+
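+        # DeciWatch runs in three stages: the encoder denoises the sparsely
+        # sampled (visible) frames, the denoised frames are linearly
+        # interpolated to every time step, and the decoder refines the
+        # interpolation into the final recovered sequence.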
+ # mask on all sequences:
+ trans_src = self.encoder_embed(input_seq)
+ mem = self.encode(trans_src, encoder_mask, encoder_pos_embed)
+ reco = self.encoder_joints_embed(mem) + input
+
+ interp = self.interpolate_embedding(reco, sample_interval)
+ center = interp.clone()
+ trans_tgt = self.decoder_embed(interp.permute(1, 2,
+ 0)).permute(2, 0, 1)
+
+ output = self.decode(mem, encoder_mask, encoder_pos_embed, trans_tgt,
+ decoder_mask, decoder_pos_embed)
+
+ joints = self.decoder_joints_embed(output) + center
+ return joints, reco
+
+ def encode(self, src, src_mask, pos_embed):
+
+ mask = torch.eye(src.shape[0]).bool().to(src.device)
+ memory = self.encoder(
+ src, mask=mask, src_key_padding_mask=src_mask, pos=pos_embed)
+
+ return memory
+
+ def decode(self, memory, memory_mask, memory_pos, tgt, tgt_mask, tgt_pos):
+ hs = self.decoder(
+ tgt,
+ memory,
+ tgt_key_padding_mask=tgt_mask,
+ memory_key_padding_mask=memory_mask,
+ pos=memory_pos,
+ query_pos=tgt_pos)
+ return hs
+
+
+class DeciWatchTransformerEncoder(nn.Module):
+
+ def __init__(self, encoder_layer, num_layers, norm=None):
+ super().__init__()
+ self.layers = _get_clones(encoder_layer, num_layers)
+ self.num_layers = num_layers
+ self.norm = norm
+
+ def forward(self,
+ src,
+ mask: Optional[Tensor] = None,
+ src_key_padding_mask: Optional[Tensor] = None,
+ pos: Optional[Tensor] = None):
+ output = src
+
+ for layer in self.layers:
+ output = layer(
+ output,
+ src_mask=mask,
+ src_key_padding_mask=src_key_padding_mask,
+ pos=pos)
+
+ if self.norm is not None:
+ output = self.norm(output)
+
+ return output
+
+
+class DeciWatchTransformerDecoder(nn.Module):
+
+ def __init__(self, decoder_layer, num_layers, norm=None):
+ super().__init__()
+ self.layers = _get_clones(decoder_layer, num_layers)
+ self.num_layers = num_layers
+ self.norm = norm
+
+ def forward(self,
+ tgt,
+ memory,
+ tgt_mask: Optional[Tensor] = None,
+ memory_mask: Optional[Tensor] = None,
+ tgt_key_padding_mask: Optional[Tensor] = None,
+ memory_key_padding_mask: Optional[Tensor] = None,
+ pos: Optional[Tensor] = None,
+ query_pos: Optional[Tensor] = None):
+ output = tgt
+
+ for layer in self.layers:
+ output = layer(
+ output,
+ memory,
+ tgt_mask=tgt_mask,
+ memory_mask=memory_mask,
+ tgt_key_padding_mask=tgt_key_padding_mask,
+ memory_key_padding_mask=memory_key_padding_mask,
+ pos=pos,
+ query_pos=query_pos)
+
+ if self.norm is not None:
+ output = self.norm(output)
+
+ return output
+
+
+class DeciWatchTransformerEncoderLayer(nn.Module):
+
+ def __init__(self,
+ encoder_hidden_dim,
+ nhead,
+ dim_feedforward=2048,
+ dropout=0.1,
+ activation='relu',
+ pre_norm=False):
+ super().__init__()
+ self.self_attn = nn.MultiheadAttention(
+ encoder_hidden_dim, nhead, dropout=dropout)
+ # Implementation of Feedforward model
+ self.linear1 = nn.Linear(encoder_hidden_dim, dim_feedforward)
+ self.dropout = nn.Dropout(dropout)
+ self.linear2 = nn.Linear(dim_feedforward, encoder_hidden_dim)
+
+ self.norm1 = nn.LayerNorm(encoder_hidden_dim)
+ self.norm2 = nn.LayerNorm(encoder_hidden_dim)
+ self.dropout1 = nn.Dropout(dropout)
+ self.dropout2 = nn.Dropout(dropout)
+
+ self.activation = _get_activation_fn(activation)
+ self.pre_norm = pre_norm
+
+ def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+ return tensor if pos is None else tensor + pos
+
+ def forward_post(self,
+ src,
+ src_mask: Optional[Tensor] = None,
+ src_key_padding_mask: Optional[Tensor] = None,
+ pos: Optional[Tensor] = None):
+ q = k = self.with_pos_embed(src, pos)
+ src2 = self.self_attn(
+ q,
+ k,
+ value=src,
+ attn_mask=src_mask,
+ key_padding_mask=src_key_padding_mask.bool())[0]
+ src = src + self.dropout1(src2)
+ src = self.norm1(src)
+ src2 = self.linear2(self.dropout(self.activation(self.linear1(src))))
+ src = src + self.dropout2(src2)
+ src = self.norm2(src)
+ return src
+
+ def forward_pre(self,
+ src,
+ src_mask: Optional[Tensor] = None,
+ src_key_padding_mask: Optional[Tensor] = None,
+ pos: Optional[Tensor] = None):
+ src2 = self.norm1(src)
+ q = k = self.with_pos_embed(src2, pos)
+ src2 = self.self_attn(
+ q,
+ k,
+ value=src2,
+ attn_mask=src_mask,
+ key_padding_mask=src_key_padding_mask)[0]
+ src = src + self.dropout1(src2)
+ src2 = self.norm2(src)
+ src2 = self.linear2(self.dropout(self.activation(self.linear1(src2))))
+ src = src + self.dropout2(src2)
+ return src
+
+ def forward(self,
+ src,
+ src_mask: Optional[Tensor] = None,
+ src_key_padding_mask: Optional[Tensor] = None,
+ pos: Optional[Tensor] = None):
+ if self.pre_norm:
+ return self.forward_pre(src, src_mask, src_key_padding_mask, pos)
+ return self.forward_post(src, src_mask, src_key_padding_mask, pos)
+
+
+class DeciWatchTransformerDecoderLayer(nn.Module):
+
+ def __init__(self,
+ decoder_hidden_dim,
+ nhead,
+ dim_feedforward=2048,
+ dropout=0.1,
+ activation='relu',
+ pre_norm=False):
+ super().__init__()
+ self.self_attn = nn.MultiheadAttention(
+ decoder_hidden_dim, nhead, dropout=dropout)
+ self.multihead_attn = nn.MultiheadAttention(
+ decoder_hidden_dim, nhead, dropout=dropout)
+ # Implementation of Feedforward model
+ self.linear1 = nn.Linear(decoder_hidden_dim, dim_feedforward)
+ self.dropout = nn.Dropout(dropout)
+ self.linear2 = nn.Linear(dim_feedforward, decoder_hidden_dim)
+
+ self.norm1 = nn.LayerNorm(decoder_hidden_dim)
+ self.norm2 = nn.LayerNorm(decoder_hidden_dim)
+ self.norm3 = nn.LayerNorm(decoder_hidden_dim)
+ self.dropout1 = nn.Dropout(dropout)
+ self.dropout2 = nn.Dropout(dropout)
+ self.dropout3 = nn.Dropout(dropout)
+
+ self.activation = _get_activation_fn(activation)
+ self.pre_norm = pre_norm
+
+ def with_pos_embed(self, tensor, pos: Optional[Tensor]):
+ return tensor if pos is None else tensor + pos
+
+ def forward_post(self,
+ tgt,
+ memory,
+ tgt_mask: Optional[Tensor] = None,
+ memory_mask: Optional[Tensor] = None,
+ tgt_key_padding_mask: Optional[Tensor] = None,
+ memory_key_padding_mask: Optional[Tensor] = None,
+ pos: Optional[Tensor] = None,
+ query_pos: Optional[Tensor] = None):
+ q = k = self.with_pos_embed(tgt, query_pos)
+ tgt2 = self.self_attn(
+ q,
+ k,
+ value=tgt,
+ attn_mask=tgt_mask,
+ key_padding_mask=tgt_key_padding_mask.bool())[0]
+ tgt = tgt + self.dropout1(tgt2)
+ tgt = self.norm1(tgt)
+ tgt2 = self.multihead_attn(
+ query=self.with_pos_embed(tgt, query_pos),
+ key=self.with_pos_embed(memory, pos),
+ value=memory,
+ attn_mask=memory_mask,
+ key_padding_mask=memory_key_padding_mask.bool())[0]
+ tgt = tgt + self.dropout2(tgt2)
+ tgt = self.norm2(tgt)
+ tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
+ tgt = tgt + self.dropout3(tgt2)
+ tgt = self.norm3(tgt)
+ return tgt
+
+ def forward_pre(self,
+ tgt,
+ memory,
+ tgt_mask: Optional[Tensor] = None,
+ memory_mask: Optional[Tensor] = None,
+ tgt_key_padding_mask: Optional[Tensor] = None,
+ memory_key_padding_mask: Optional[Tensor] = None,
+ pos: Optional[Tensor] = None,
+ query_pos: Optional[Tensor] = None):
+ tgt2 = self.norm1(tgt)
+ q = k = self.with_pos_embed(tgt2, query_pos)
+ tgt2 = self.self_attn(
+ q,
+ k,
+ value=tgt2,
+ attn_mask=tgt_mask,
+ key_padding_mask=tgt_key_padding_mask)[0]
+ tgt = tgt + self.dropout1(tgt2)
+ tgt2 = self.norm2(tgt)
+ tgt2 = self.multihead_attn(
+ query=self.with_pos_embed(tgt2, query_pos),
+ key=self.with_pos_embed(memory, pos),
+ value=memory,
+ attn_mask=memory_mask,
+ key_padding_mask=memory_key_padding_mask.bool())[0]
+
+ tgt = tgt + self.dropout2(tgt2)
+ tgt2 = self.norm3(tgt)
+ tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2))))
+ tgt = tgt + self.dropout3(tgt2)
+ return tgt
+
+ def forward(self,
+ tgt,
+ memory,
+ tgt_mask: Optional[Tensor] = None,
+ memory_mask: Optional[Tensor] = None,
+ tgt_key_padding_mask: Optional[Tensor] = None,
+ memory_key_padding_mask: Optional[Tensor] = None,
+ pos: Optional[Tensor] = None,
+ query_pos: Optional[Tensor] = None):
+ if self.pre_norm:
+ return self.forward_pre(tgt, memory, tgt_mask, memory_mask,
+ tgt_key_padding_mask,
+ memory_key_padding_mask, pos, query_pos)
+ return self.forward_post(tgt, memory, tgt_mask, memory_mask,
+ tgt_key_padding_mask, memory_key_padding_mask,
+ pos, query_pos)
+
+
+def _get_clones(module, N):
+ return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+
+def build_model(args):
+ return DeciWatchTransformer(
+ input_nc=args['input_dim'],
+ decoder_hidden_dim=args['decoder_hidden_dim'],
+ encoder_hidden_dim=args['encoder_hidden_dim'],
+ dropout=args['dropout'],
+ nhead=args['nheads'],
+ dim_feedforward=args['dim_feedforward'],
+ num_encoder_layers=args['enc_layers'],
+ num_decoder_layers=args['dec_layers'],
+ activation=args['activation'],
+ pre_norm=args['pre_norm'],
+ )
+
+
+def _get_activation_fn(activation):
+ """Return an activation function given a string."""
+ if activation == 'relu':
+ return F.relu
+ if activation == 'gelu':
+ return F.gelu
+ if activation == 'glu':
+ return F.glu
+ if activation == 'leaky_relu':
+ return F.leaky_relu
+    raise RuntimeError(F'activation should be relu/gelu/glu/leaky_relu, not {activation}.')
diff --git a/detrsmpl/core/renderer/__init__.py b/detrsmpl/core/renderer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/core/renderer/matplotlib3d_renderer.py b/detrsmpl/core/renderer/matplotlib3d_renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..e607227778225f3ecebff33c5672bd5ffdfc2750
--- /dev/null
+++ b/detrsmpl/core/renderer/matplotlib3d_renderer.py
@@ -0,0 +1,408 @@
+import io
+import os
+import shutil
+from pathlib import Path
+from typing import Iterable, List, Optional, Union
+
+import cv2
+import mmcv
+import numpy as np
+from matplotlib import pyplot as plt
+from matplotlib.lines import Line2D
+from mpl_toolkits.mplot3d import Axes3D
+
+from detrsmpl.core.conventions.cameras.convert_convention import \
+ enc_camera_convention # prevent yapf isort conflict
+from detrsmpl.utils.demo_utils import get_different_colors
+from detrsmpl.utils.ffmpeg_utils import images_to_video
+from detrsmpl.utils.path_utils import check_path_suffix
+
+
+class Axes3dBaseRenderer(object):
+ """Base renderer."""
+ def init_camera(self,
+ cam_elev_angle=10,
+ cam_elev_speed=0.0,
+ cam_hori_angle=45,
+ cam_hori_speed=0.5):
+        """Initialize the camera trajectory with the given arguments.
+
+ Args:
+ cam_elev_angle (int, optional):
+ The pitch angle where camera starts.
+ Defaults to 10.
+ cam_elev_speed (float, optional):
+ The pitch angle camera steps in one frame.
+ It will go back and forth between -30 and 30 degree.
+ Defaults to 0.0.
+ cam_hori_angle (int, optional):
+ The yaw angle where camera starts. Defaults to 45.
+ cam_hori_speed (float, optional):
+ The yaw angle camera steps in one frame.
+ It will go back and forth between 0 and 90 degree.
+ Defaults to 0.5.
+ """
+ self.cam_elevation_args = [cam_elev_angle, cam_elev_speed]
+ self.cam_horizon_args = [cam_hori_angle, cam_hori_speed]
+ self.if_camera_init = True
+
+ def _get_camera_vector_list(self, frame_number):
+ """Generate self.cam_vector_list according to hori and elev arguments.
+
+ Args:
+ frame_number (int):
+ Number of frames.
+
+ Returns:
+ List[List[float, float]]:
+ A list of float vectors.
+ """
+ self.cam_vector_list = [
+ [self.cam_elevation_args[0], self.cam_horizon_args[0]],
+ ]
+ ele_sign = 1
+ hor_sign = 1
+ for _ in range(frame_number - 1):
+ new_ele_angle = ele_sign * self.cam_elevation_args[
+ 1] + self.cam_vector_list[-1][0]
+ # if elevation angle out of range, go backwards
+ if new_ele_angle <= self.cam_elevation_args[
+ 1] or new_ele_angle >= 30:
+ ele_sign = (-1) * ele_sign
+ new_ele_angle = (ele_sign * self.cam_elevation_args[1] +
+ self.cam_vector_list[-1][0])
+ new_hor_angle = (hor_sign * self.cam_horizon_args[1] +
+ self.cam_vector_list[-1][1])
+ # if horizon angle out of range, go backwards
+ if new_hor_angle >= 90 - 2 * self.cam_horizon_args[
+ 1] or new_hor_angle <= 2 * self.cam_horizon_args[1]:
+ hor_sign = (-1) * hor_sign
+ new_hor_angle = (hor_sign * self.cam_horizon_args[1] +
+ self.cam_vector_list[-1][1])
+ self.cam_vector_list.append([new_ele_angle, new_hor_angle])
+ return self.cam_vector_list
+
+ @staticmethod
+ def _get_visual_range(points: np.ndarray) -> np.ndarray:
+        """Calculate the visual range according to the input points. It makes
+        sure that no point falls outside the visible range.
+
+ Args:
+ points (np.ndarray):
+ An array of 3D points.
+ Axis at the last dim.
+
+ Returns:
+ np.ndarray:
+ An array in shape [3, 2].
+ It marks the lower bound and the upper bound
+ along each axis.
+ """
+ axis_num = points.shape[-1]
+ axis_stat = np.zeros(shape=[axis_num, 4])
+ for axis_index in range(axis_num):
+ axis_data = points[..., axis_index]
+ axis_min = np.min(axis_data)
+ axis_max = np.max(axis_data)
+ axis_mid = (axis_min + axis_max) / 2.0
+ axis_span = axis_max - axis_min
+ axis_stat[axis_index] = np.asarray(
+ (axis_min, axis_max, axis_mid, axis_span))
+ max_span = np.max(axis_stat[:, 3])
+ visual_range = np.zeros(shape=[axis_num, 2])
+ for axis_index in range(axis_num):
+ visual_range[axis_index, 0] =\
+ axis_stat[axis_index, 2] - max_span/2.0
+ visual_range[axis_index, 1] =\
+ axis_stat[axis_index, 2] + max_span/2.0
+ return visual_range
+
+ def _draw_scene(self,
+ visual_range,
+ axis_len=1.0,
+ cam_elev_angle=10,
+ cam_hori_angle=45):
+ """Draw an empty scene according to visual range and camera vector.
+
+ Args:
+ visual_range (np.ndarray):
+ Return value of _get_visual_range().
+ axis_len (float, optional):
+ The length of every axis.
+ Defaults to 1.0.
+ cam_elev_angle (int, optional):
+ Pitch angle of the camera.
+ Defaults to 10.
+ cam_hori_angle (int, optional):
+ Yaw angle of the camera.
+ Defaults to 45.
+
+ Returns:
+ list: Figure and Axes3D
+ """
+ fig = plt.figure()
+ ax = Axes3D(fig, auto_add_to_figure=False)
+ fig.add_axes(ax)
+ ax.set_xlim(*visual_range[0])
+ ax.set_ylim(*visual_range[1])
+ ax.set_zlim(*visual_range[2])
+ ax.view_init(cam_elev_angle, cam_hori_angle)
+ mid_point = [
+ np.average(visual_range[0]),
+ np.average(visual_range[1]),
+ np.average(visual_range[2]),
+ ]
+ # draw axis
+ zero_point = np.array([0, 0, 0])
+ x_axis = np.array([(visual_range[0][1] - mid_point[0]) * axis_len, 0,
+ 0])
+ y_axis = np.array(
+ [0, (visual_range[1][1] - mid_point[1]) * axis_len, 0])
+ z_axis = np.array(
+ [0, 0, (visual_range[2][1] - mid_point[2]) * axis_len])
+ ax = _plot_line_on_fig(ax, zero_point, x_axis, 'r')
+ ax = _plot_line_on_fig(ax, zero_point, y_axis, 'g')
+ ax = _plot_line_on_fig(ax, zero_point, z_axis, 'b')
+ return fig, ax
+
+
+class Axes3dJointsRenderer(Axes3dBaseRenderer):
+    """Renderer of 3D joints."""
+ def __init__(self):
+ self.if_camera_init = False
+ self.cam_vector_list = None
+ self.if_connection_setup = False
+ self.if_frame_updated = False
+ self.temp_path = ''
+
+ def set_connections(self, limbs_connection, limbs_palette):
+ """set body limbs."""
+ self.limbs_connection = limbs_connection
+ self.limbs_palette = limbs_palette
+ self.if_connection_setup = True
+
+ def render_kp3d_to_video(
+ self,
+ keypoints_np: np.ndarray,
+ output_path: Optional[str] = None,
+ convention='opencv',
+ fps: Union[float, int] = 30,
+ resolution: Iterable[int] = (720, 720),
+ visual_range: Iterable[int] = (-100, 100),
+ frame_names: Optional[List[str]] = None,
+ disable_limbs: bool = False,
+ return_array: bool = False,
+ ) -> None:
+ """Render 3d keypoints to a video.
+
+ Args:
+ keypoints_np (np.ndarray): shape of input array should be
+ (f * n * J * 3).
+ output_path (str): output video path or frame folder.
+            convention (str, optional): camera/axis convention of the
+                input keypoints. Defaults to 'opencv'.
+            fps (Union[float, int], optional): fps.
+                Defaults to 30.
+            resolution (Iterable[int], optional): (width, height) of
+                output video.
+                Defaults to (720, 720).
+            visual_range (Iterable[int], optional): range of axis value.
+                Defaults to (-100, 100).
+            frame_names (Optional[List[str]], optional): List of string
+                for frame title, no title if None. Defaults to None.
+            disable_limbs (bool, optional): whether to disable drawing
+                limbs. Defaults to False.
+            return_array (bool, optional): whether to return the rendered
+                frames as a stacked array. Defaults to False.
+        Returns:
+            Optional[np.ndarray]: the rendered frames if `return_array`
+                is True, otherwise None.
+ """
+ assert self.if_camera_init is True
+ assert self.if_connection_setup is True
+ sign, axis = enc_camera_convention(convention)
+ if output_path is not None:
+ if check_path_suffix(output_path, ['.mp4', '.gif']):
+ self.temp_path = os.path.join(
+ Path(output_path).parent,
+ Path(output_path).name + '_output_temp')
+ mmcv.mkdir_or_exist(self.temp_path)
+ print('make dir', self.temp_path)
+ self.remove_temp = True
+ else:
+ self.temp_path = output_path
+ self.remove_temp = False
+ else:
+ self.temp_path = None
+ keypoints_np = _set_new_pose(keypoints_np, sign, axis)
+ if not self.if_frame_updated:
+ if self.cam_vector_list is None:
+ self._get_camera_vector_list(
+ frame_number=keypoints_np.shape[0])
+ assert len(self.cam_vector_list) == keypoints_np.shape[0]
+ if visual_range is None:
+ visual_range = self._get_visual_range(keypoints_np)
+ else:
+ visual_range = np.asarray(visual_range)
+ if len(visual_range.shape) == 1:
+ one_dim_visual_range = np.expand_dims(visual_range, 0)
+ visual_range = one_dim_visual_range.repeat(3, axis=0)
+ image_array = self._export_frames(keypoints_np, resolution,
+ visual_range, frame_names,
+ disable_limbs, return_array)
+ self.if_frame_updated = True
+
+ if output_path is not None:
+ if check_path_suffix(output_path, '.mp4'):
+ images_to_video(self.temp_path,
+ output_path,
+ img_format='frame_%06d.png',
+ fps=fps)
+ return image_array
+
+ def _export_frames(self, keypoints_np, resolution, visual_range,
+ frame_names, disable_limbs, return_array):
+ """Write output/temp images."""
+ image_array = []
+ for frame_index in range(keypoints_np.shape[0]):
+ keypoints_frame = keypoints_np[frame_index]
+ cam_ele, cam_hor = self.cam_vector_list[frame_index]
+ fig, ax = \
+ self._draw_scene(visual_range=visual_range, axis_len=0.5,
+ cam_elev_angle=cam_ele,
+ cam_hori_angle=cam_hor)
+ # draw limbs
+ num_person = keypoints_frame.shape[0]
+ for person_index, keypoints_person in enumerate(keypoints_frame):
+ if num_person >= 2:
+ self.limbs_palette = get_different_colors(
+ num_person)[person_index].reshape(-1, 3)
+ if not disable_limbs:
+ for part_name, limbs in self.limbs_connection.items():
+ if part_name == 'body':
+ linewidth = 2
+ else:
+ linewidth = 1
+ if isinstance(self.limbs_palette, np.ndarray):
+ color = self.limbs_palette.astype(
+ np.int32).reshape(-1, 3)
+ elif isinstance(self.limbs_palette, dict):
+ color = np.array(
+ self.limbs_palette[part_name]).astype(np.int32)
+ for limb_index, limb in enumerate(limbs):
+ limb_index = min(limb_index, len(color) - 1)
+
+ ax = _plot_line_on_fig(
+ ax,
+ keypoints_person[limb[0]],
+ keypoints_person[limb[1]],
+ color=np.array(color[limb_index]) / 255.0,
+ linewidth=linewidth)
+ scatter_points_index = list(
+ set(
+ np.array(self.limbs_connection['body']).reshape(
+ -1).tolist()))
+ ax.scatter(keypoints_person[scatter_points_index, 0],
+ keypoints_person[scatter_points_index, 1],
+ keypoints_person[scatter_points_index, 2],
+ c=np.array([0, 0, 0]).reshape(1, -1),
+ s=10,
+ marker='o')
+ if num_person >= 2:
+ ax.xaxis.set_ticklabels([])
+ ax.yaxis.set_ticklabels([])
+ ax.zaxis.set_ticklabels([])
+ labels = []
+ custom_lines = []
+ for person_index in range(num_person):
+ color = get_different_colors(
+ num_person)[person_index].reshape(1, 3) / 255.0
+ custom_lines.append(
+ Line2D([0], [0],
+ linestyle='-',
+ color=color[0],
+ lw=2,
+ marker='',
+ markeredgecolor='k',
+ markeredgewidth=.1,
+ markersize=20))
+ labels.append(f'person_{person_index + 1}')
+ ax.legend(
+ handles=custom_lines,
+ labels=labels,
+ loc='upper left',
+ )
+ plt.close('all')
+ rgb_mat = _get_cv2mat_from_buf(fig)
+ resized_mat = cv2.resize(rgb_mat, resolution)
+ if frame_names is not None:
+ cv2.putText(
+ resized_mat, str(frame_names[frame_index]),
+ (resolution[0] // 10, resolution[1] // 10),
+ cv2.FONT_HERSHEY_SIMPLEX, 0.5 * resolution[0] / 500,
+ np.array([255, 255, 255]).astype(np.int32).tolist(), 2)
+ if self.temp_path is not None:
+ frame_path = os.path.join(self.temp_path,
+ 'frame_%06d.png' % frame_index)
+ cv2.imwrite(frame_path, resized_mat)
+ if return_array:
+ image_array.append(resized_mat[None])
+ if return_array:
+ image_array = np.concatenate(image_array)
+ return image_array
+ else:
+ return None
+
+ def __del__(self):
+ """remove temp images."""
+ self.remove_temp_frames()
+
+ def remove_temp_frames(self):
+ """remove temp images."""
+ if self.temp_path is not None:
+ if Path(self.temp_path).is_dir() and self.remove_temp:
+ shutil.rmtree(self.temp_path)
+
+
+def _set_new_pose(pose_np, sign, axis):
+ """set new pose with axis convention."""
+ target_sign = [-1, 1, -1]
+ target_axis = ['x', 'z', 'y']
+
+ pose_rearrange_axis_result = pose_np.copy()
+ for axis_index, axis_name in enumerate(target_axis):
+ src_axis_index = axis.index(axis_name)
+ pose_rearrange_axis_result[..., axis_index] = \
+ pose_np[..., src_axis_index]
+
+ for dim_index in range(pose_rearrange_axis_result.shape[-1]):
+ pose_rearrange_axis_result[
+ ..., dim_index] = sign[dim_index] / target_sign[
+ dim_index] * pose_rearrange_axis_result[..., dim_index]
+ return pose_rearrange_axis_result
+
+
+def _plot_line_on_fig(ax,
+ point1_location,
+ point2_location,
+ color,
+ linewidth=1):
+ """Draw line on fig with matplotlib."""
+ ax.plot([point1_location[0], point2_location[0]],
+ [point1_location[1], point2_location[1]],
+ [point1_location[2], point2_location[2]],
+ color=color,
+ linewidth=linewidth)
+ return ax
+
+
+def _get_cv2mat_from_buf(fig, dpi=180):
+ """Get numpy image from IO."""
+ buf = io.BytesIO()
+ fig.savefig(buf, format='png', dpi=dpi)
+ buf.seek(0)
+ img_arr = np.frombuffer(buf.getvalue(), dtype=np.uint8)
+ buf.close()
+ img = cv2.imdecode(img_arr, 1)
+ img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
+ return img
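+
+
+# A minimal usage sketch (the limb connections, palette and keypoints below
+# are placeholder values):
+#
+#     renderer = Axes3dJointsRenderer()
+#     renderer.init_camera(cam_elev_angle=10, cam_hori_angle=45)
+#     renderer.set_connections(
+#         limbs_connection={'body': [[0, 1], [1, 2]]},
+#         limbs_palette=np.array([[255, 0, 0]]))
+#     keypoints = np.random.rand(30, 1, 3, 3)  # [frames, persons, joints, xyz]
+#     renderer.render_kp3d_to_video(
+#         keypoints, output_path='demo_out/kp3d.mp4', convention='opencv',
+#         fps=30, visual_range=(-1, 1))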
diff --git a/detrsmpl/core/renderer/mpr_renderer/__init__.py b/detrsmpl/core/renderer/mpr_renderer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e833984e8c1f456d89a610c6878c3832610b3cb4
--- /dev/null
+++ b/detrsmpl/core/renderer/mpr_renderer/__init__.py
@@ -0,0 +1,6 @@
+"""minimal_pytorch_rasterizer is a CUDA non-differentiable mesh rasterization
+library for pytorch tensors with python bindings.
+
+This code is adapted from
+`https://github.com/rmbashirov/minimal_pytorch_rasterizer`.
+"""
diff --git a/detrsmpl/core/renderer/mpr_renderer/camera.py b/detrsmpl/core/renderer/mpr_renderer/camera.py
new file mode 100644
index 0000000000000000000000000000000000000000..a73fad7c5edab58369583259fccd200bbac7a821
--- /dev/null
+++ b/detrsmpl/core/renderer/mpr_renderer/camera.py
@@ -0,0 +1,52 @@
+import numpy as np
+import torch
+
+
+class Pinhole2D:
+ def __init__(self, K=None, fx=None, fy=None, cx=None, cy=None, h=0, w=0):
+ if K is not None:
+ assert fx is None and fy is None and cx is None and cy is None
+ self.fx = K[0, 0]
+ self.fy = K[1, 1]
+ self.cx = K[0, 2]
+ self.cy = K[1, 2]
+ else:
+ assert \
+ fx is not None and fy is not None and \
+ cx is not None and cy is not None
+ self.fx = fx
+ self.fy = fy
+ self.cx = cx
+ self.cy = cy
+ self.h = h
+ self.w = w
+
+ def get_K(self):
+ return np.array([[self.fx, 0, self.cx], [0, self.fy, self.cy],
+ [0, 0, 1]])
+
+ def project_ndc(self, vertices, eps=1e-9):
+ """
+ vertices: torch.Tensor of shape (N, 3), 3 stands for xyz
+ """
+ assert isinstance(vertices, torch.Tensor)
+ assert len(vertices.shape) == 2
+ assert vertices.shape[1] == 3
+ K = torch.tensor(self.get_K(),
+ device=vertices.device,
+ dtype=vertices.dtype)
+
+ # apply intrinsics
+ vertices_ndc = vertices @ K.transpose(1, 0)
+
+ # divide xy by z, leave z unchanged
+ vertices_ndc[:, [0, 1]] /= vertices_ndc[:, [2]] + eps
+
+ # convert x from [0, w) to [-1, 1] range
+ # convert y from [0, h) to [-1, 1] range
+ wh = torch.tensor([self.w, self.h],
+ device=vertices.device,
+ dtype=vertices.dtype).unsqueeze(0)
+ vertices_ndc[:, [0, 1]] = 2 * vertices_ndc[:, [0, 1]] / wh - 1
+
+ return vertices_ndc
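+
+
+# A minimal usage sketch (the intrinsics below are arbitrary example values):
+#
+#     cam = Pinhole2D(fx=500.0, fy=500.0, cx=256.0, cy=256.0, h=512, w=512)
+#     vertices = torch.rand(100, 3) + torch.tensor([0.0, 0.0, 2.0])  # in front of the camera
+#     vertices_ndc = cam.project_ndc(vertices)  # (100, 3): x, y in NDC, z kept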
diff --git a/detrsmpl/core/renderer/mpr_renderer/cuda/rasterizer.cpp b/detrsmpl/core/renderer/mpr_renderer/cuda/rasterizer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fd7402b8b4c6db20f67ed42d485833deda38017f
--- /dev/null
+++ b/detrsmpl/core/renderer/mpr_renderer/cuda/rasterizer.cpp
@@ -0,0 +1,83 @@
+#include <torch/extension.h>
+#include <vector>
+
+// CUDA forward declarations
+
+std::vector<torch::Tensor> estimate_normals_cuda(
+ const torch::Tensor& vertices_ndc,
+ const torch::Tensor& faces,
+ const torch::Tensor& vertices,
+ const torch::Tensor& vertices_filter,
+ int h, int w
+);
+
+
+torch::Tensor project_mesh_cuda(
+ const torch::Tensor& vertices_ndc,
+ const torch::Tensor& faces,
+ const torch::Tensor& vertice_values,
+ const torch::Tensor& vertices_filter,
+ int h, int w
+);
+
+// C++ interface
+
+#define CHECK_CUDA(x) TORCH_CHECK(x.type().is_cuda(), #x " must be a CUDA tensor")
+#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
+#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
+
+void check_equal_dtype(const torch::Tensor& a, const torch::Tensor& b) {
+ TORCH_CHECK(
+ a.dtype() == b.dtype(),
+ "expected equal dtype, got ", a.dtype(), " != ", b.dtype()
+ );
+}
+
+void check_equal_gpuid(const torch::Tensor& a, const torch::Tensor& b) {
+ TORCH_CHECK(
+ a.device().index() == b.device().index(),
+ "expected equal gpu id, got ", a.device().index(), " != ", b.device().index()
+ );
+}
+
+std::vector<torch::Tensor> estimate_normals(
+ const torch::Tensor& vertices_ndc,
+ const torch::Tensor& faces,
+ const torch::Tensor& vertices,
+ const torch::Tensor& vertices_filter,
+ int h, int w
+) {
+ TORCH_CHECK(h > 0, "h expected to be > 0");
+ TORCH_CHECK(w > 0, "w expected to be > 0");
+ CHECK_INPUT(vertices_ndc);
+ CHECK_INPUT(faces);
+ CHECK_INPUT(vertices_filter);
+ return estimate_normals_cuda(
+ vertices_ndc, faces, vertices, vertices_filter,
+ h, w
+ );
+}
+
+torch::Tensor project_mesh(
+ const torch::Tensor& vertices_ndc,
+ const torch::Tensor& faces,
+ const torch::Tensor& vertice_values,
+ const torch::Tensor& vertices_filter,
+ int h, int w
+) {
+ TORCH_CHECK(h > 0, "h expected to be > 0");
+ TORCH_CHECK(w > 0, "w expected to be > 0");
+ CHECK_INPUT(vertices_ndc);
+ CHECK_INPUT(faces);
+ CHECK_INPUT(vertice_values);
+ CHECK_INPUT(vertices_filter);
+ return project_mesh_cuda(
+ vertices_ndc, faces, vertice_values, vertices_filter,
+ h, w
+ );
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("estimate_normals", &estimate_normals, "estimate_normals (CUDA)");
+ m.def("project_mesh", &project_mesh, "project_mesh (CUDA)");
+}
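+
+// The extension is typically built on the fly from Python; a sketch (the
+// exact build wiring depends on the repository's setup):
+//
+//   from torch.utils.cpp_extension import load
+//   mpr_cuda = load(name='mpr_cuda',
+//                   sources=['rasterizer.cpp', 'rasterizer_kernel.cu'])
+//   attrs = mpr_cuda.project_mesh(
+//       vertices_ndc, faces, vertice_values, vertices_filter, h, w)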
diff --git a/detrsmpl/core/renderer/mpr_renderer/cuda/rasterizer_kernel.cu b/detrsmpl/core/renderer/mpr_renderer/cuda/rasterizer_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..797a78c602b28040d21585b786fb8666f28f70f2
--- /dev/null
+++ b/detrsmpl/core/renderer/mpr_renderer/cuda/rasterizer_kernel.cu
@@ -0,0 +1,577 @@
+/*
+
+There are 2 ways to rasterize triangles that come to mind:
+1) iterate over all pixels (they define the CUDA grid); for the selected pixel, feed all triangles to 1 CUDA block
+2) iterate over all triangles (they define the CUDA grid); for the selected triangle, feed the pixels bounded by it to 1 CUDA block
+
+The 2nd way is implemented here.
+*/
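+
+// Overall flow: rasterize_cuda_kernel first writes the minimum depth of every
+// covered pixel into a z-buffer using atomic min operations; a second pass
+// (interpolate_cuda_kernel / estimate_normals_cuda_kernel) revisits the same
+// pixels, keeps only the fragments whose depth matches the buffer, and
+// interpolates per-vertex attributes with perspective-correct barycentric
+// weights.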
+
+
+#include <torch/extension.h>
+#include <ATen/ATen.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/Exceptions.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <vector>
+
+#define BLOCK_SIZE 512
+#define BLOCK_SIZE_2D_X 32
+#define BLOCK_SIZE_2D_Y 16
+#define BLOCK_SIZE_3D_X 32
+#define BLOCK_SIZE_3D_Y 8
+#define BLOCK_SIZE_3D_Z 4
+
+// vertices coords:
+// vertices[:, 0]: x
+// vertices[:, 1]: y
+// vertices[:, 2]: z
+
+// 2d tensor axis:
+// 0: yi
+// 1: xi
+
+// 3d tensor axis:
+// 0: zi
+// 1: yi
+// 2: xi
+
+template <typename scalar_t>
+__device__ __forceinline__ scalar_t atomicMinFloat(scalar_t * addr, scalar_t value) {
+ scalar_t old;
+ old = (value >= 0) ? __int_as_float(atomicMin((int *)addr, __float_as_int(value))) :
+ __uint_as_float(atomicMax((unsigned int *)addr, __float_as_uint(value)));
+ return old;
+}
+
+__device__ double atomicMin_double(double* address, double val)
+{
+ unsigned long long int* address_as_ull = (unsigned long long int*) address;
+ unsigned long long int old = *address_as_ull, assumed;
+ do {
+ assumed = old;
+ old = atomicCAS(address_as_ull, assumed,
+ __double_as_longlong(fmin(val, __longlong_as_double(assumed))));
+ } while (assumed != old);
+ return __longlong_as_double(old);
+}
+
+// kernel utils
+
+template <typename scalar_t>
+__device__ int lower_bound(const scalar_t* values, const scalar_t value, const int N) {
+ int left = 0;
+ int right = N;
+ int mid;
+ while (right - left > 1) {
+ mid = (left + right) / 2;
+ if (values[mid] < value) {
+ left = mid;
+ } else {
+ right = mid;
+ }
+ }
+ return right;
+}
+
+// kernels
+
+template <typename scalar_t>
+__global__ void rasterize_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits> vertices_ndc,
+    const torch::PackedTensorAccessor32<int32_t, 2, torch::RestrictPtrTraits> faces,
+    const torch::PackedTensorAccessor32<uint8_t, 1, torch::RestrictPtrTraits> vertices_filter,
+    torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits> depth,
+ scalar_t* global_face_ndc_inv,
+ int* global_is_bad_face
+) {
+ const int face_indx = blockIdx.x;
+ const int H = depth.size(0);
+ const int W = depth.size(1);
+
+ scalar_t min_x, max_x, min_y, max_y;
+ scalar_t denom;
+
+ __shared__ int vertices_per_thread_x, vertices_per_thread_y;
+ __shared__ int ai, bi, ci;
+ __shared__ bool is_bad_face;
+ __shared__ int min_xi, max_xi, min_yi, max_yi;
+ __shared__ scalar_t face_ndc[9];
+ __shared__ scalar_t face_ndc_inv[9];
+ const scalar_t eps = 1e-5;
+
+ if (threadIdx.x == 0 && threadIdx.y == 0) {
+ ai = faces[face_indx][0];
+ bi = faces[face_indx][1];
+ ci = faces[face_indx][2];
+
+ if (vertices_filter[ai] == 0 || vertices_filter[bi] == 0 || vertices_filter[ci] == 0) {
+ is_bad_face = true;
+ global_is_bad_face[face_indx] = 1;
+ return;
+ }
+
+ face_ndc[0] = vertices_ndc[ai][0]; face_ndc[1] = vertices_ndc[ai][1]; face_ndc[2] = vertices_ndc[ai][2];
+ face_ndc[3] = vertices_ndc[bi][0]; face_ndc[4] = vertices_ndc[bi][1]; face_ndc[5] = vertices_ndc[bi][2];
+ face_ndc[6] = vertices_ndc[ci][0]; face_ndc[7] = vertices_ndc[ci][1]; face_ndc[8] = vertices_ndc[ci][2];
+
+ // negative vertex
+ is_bad_face = false;
+ if (face_ndc[2] < eps || face_ndc[5] < eps || face_ndc[8] < eps) {
+ is_bad_face = true;
+ global_is_bad_face[face_indx] = 1;
+ return;
+ }
+
+ face_ndc_inv[0] = face_ndc[4] - face_ndc[7];
+ face_ndc_inv[1] = face_ndc[6] - face_ndc[3];
+ face_ndc_inv[2] = face_ndc[3] * face_ndc[7] - face_ndc[6] * face_ndc[4];
+ face_ndc_inv[3] = face_ndc[7] - face_ndc[1];
+ face_ndc_inv[4] = face_ndc[0] - face_ndc[6];
+ face_ndc_inv[5] = face_ndc[6] * face_ndc[1] - face_ndc[0] * face_ndc[7];
+ face_ndc_inv[6] = face_ndc[1] - face_ndc[4];
+ face_ndc_inv[7] = face_ndc[3] - face_ndc[0];
+ face_ndc_inv[8] = face_ndc[0] * face_ndc[4] - face_ndc[3] * face_ndc[1];
+
+ denom = (
+ face_ndc[6] * (face_ndc[1] - face_ndc[4]) +
+ face_ndc[0] * (face_ndc[4] - face_ndc[7]) +
+ face_ndc[3] * (face_ndc[7] - face_ndc[1])
+ );
+
+// if (abs(denom) < eps) {
+// is_bad_face = true;
+// global_is_bad_face[face_indx] = 1;
+// return;
+// }
+
+ for (int i = 0; i < 9; ++i) {
+ face_ndc_inv[i] /= denom;
+ }
+
+ for (int i = 0; i < 9; ++i) {
+ global_face_ndc_inv[9 * face_indx + i] = face_ndc_inv[i];
+ }
+
+ global_is_bad_face[face_indx] = 0;
+
+ min_x = min(min(face_ndc[0], face_ndc[3]), face_ndc[6]);
+ min_x = (min_x + 1) / 2 * W; // convert from ndc to img coordinates
+        min_xi = static_cast<int>(floorf(static_cast<float>(min_x)));
+        min_xi = min(max(min_xi, 0), W - 1);
+        max_x = max(max(face_ndc[0], face_ndc[3]), face_ndc[6]);
+        max_x = (max_x + 1) / 2 * W;
+        max_xi = static_cast<int>(ceilf(static_cast<float>(max_x)));
+        max_xi = min(max(max_xi, 0), W - 1);
+
+        min_y = min(min(face_ndc[1], face_ndc[4]), face_ndc[7]);
+        min_y = (min_y + 1) / 2 * H;
+        min_yi = static_cast<int>(floorf(static_cast<float>(min_y)));
+        min_yi = min(max(min_yi, 0), H - 1);
+        max_y = max(max(face_ndc[1], face_ndc[4]), face_ndc[7]);
+        max_y = (max_y + 1) / 2 * H;
+        max_yi = static_cast<int>(ceilf(static_cast<float>(max_y)));
+ max_yi = min(max(max_yi, 0), H - 1);
+
+ vertices_per_thread_x = (max_xi - min_xi) / blockDim.x + 1;
+ vertices_per_thread_y = (max_yi - min_yi) / blockDim.y + 1;
+ }
+ __syncthreads();
+ if (is_bad_face) {
+ return;
+ }
+
+ const int left = min_xi + vertices_per_thread_x * threadIdx.x;
+ const int right = min(left + vertices_per_thread_x, max_xi);
+
+ const int top = min_yi + vertices_per_thread_y * threadIdx.y;
+ const int bottom = min(top + vertices_per_thread_y, max_yi);
+
+ scalar_t x, y, face_z, wa, wb, wc, wsum;
+ for (int i = top; i <= bottom; i++) {
+ for (int j = left; j <= right; j++) {
+ x = 2 * ((scalar_t)j + 0.5) / W - 1;
+ y = 2 * ((scalar_t)i + 0.5) / H - 1;
+
+ // check pixel is inside the face
+ if (((y - face_ndc[1]) * (face_ndc[3] - face_ndc[0]) > (x - face_ndc[0]) * (face_ndc[4] - face_ndc[1])) ||
+ ((y - face_ndc[4]) * (face_ndc[6] - face_ndc[3]) > (x - face_ndc[3]) * (face_ndc[7] - face_ndc[4])) ||
+ ((y - face_ndc[7]) * (face_ndc[0] - face_ndc[6]) > (x - face_ndc[6]) * (face_ndc[1] - face_ndc[7]))) {
+ continue;
+ }
+
+ wa = face_ndc_inv[0] * x + face_ndc_inv[1] * y + face_ndc_inv[2];
+ wb = face_ndc_inv[3] * x + face_ndc_inv[4] * y + face_ndc_inv[5];
+ wc = face_ndc_inv[6] * x + face_ndc_inv[7] * y + face_ndc_inv[8];
+ wsum = wa + wb + wc;
+ wa /= wsum; wb /= wsum; wc /= wsum;
+
+ wa /= face_ndc[2];
+ wb /= face_ndc[5];
+ wc /= face_ndc[8];
+ wsum = wa + wb + wc;
+ wa /= wsum; wb /= wsum; wc /= wsum;
+
+ face_z = wa * face_ndc[2] + wb * face_ndc[5] + wc * face_ndc[8];
+
+ if (sizeof(scalar_t) == sizeof(double)) {
+ atomicMin_double((double*)&depth[i][j], (double)face_z);
+ } else {
+ atomicMinFloat(&depth[i][j], face_z);
+ }
+ }
+ }
+}
+
+
+template <typename scalar_t>
+__global__ void interpolate_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits> vertices_ndc,
+    const torch::PackedTensorAccessor32<int32_t, 2, torch::RestrictPtrTraits> faces,
+    const torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits> depth,
+    const scalar_t* global_face_ndc_inv,
+    const int* global_is_bad_face,
+    const torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits> vertice_values,
+    torch::PackedTensorAccessor32<scalar_t, 3, torch::RestrictPtrTraits> result
+) {
+ const int face_indx = blockIdx.x;
+
+ if (global_is_bad_face[face_indx]) {
+ return;
+ }
+
+ const int H = depth.size(0);
+ const int W = depth.size(1);
+ const int C = vertice_values.size(1);
+ const scalar_t eps = 1e-5;
+
+ scalar_t min_x, max_x, min_y, max_y;
+ __shared__ int vertices_per_thread_x, vertices_per_thread_y;
+ __shared__ int ai, bi, ci;
+ __shared__ scalar_t face_ndc[9];
+ __shared__ scalar_t face_ndc_inv[9];
+ __shared__ int min_xi, max_xi, min_yi, max_yi;
+
+ if (threadIdx.x == 0 && threadIdx.y == 0) {
+ ai = faces[face_indx][0];
+ bi = faces[face_indx][1];
+ ci = faces[face_indx][2];
+
+ face_ndc[0] = vertices_ndc[ai][0]; face_ndc[1] = vertices_ndc[ai][1]; face_ndc[2] = vertices_ndc[ai][2];
+ face_ndc[3] = vertices_ndc[bi][0]; face_ndc[4] = vertices_ndc[bi][1]; face_ndc[5] = vertices_ndc[bi][2];
+ face_ndc[6] = vertices_ndc[ci][0]; face_ndc[7] = vertices_ndc[ci][1]; face_ndc[8] = vertices_ndc[ci][2];
+
+ for (int i = 0; i < 9; ++i) {
+ face_ndc_inv[i] = global_face_ndc_inv[9 * face_indx + i];
+ }
+
+ min_x = min(min(face_ndc[0], face_ndc[3]), face_ndc[6]);
+ min_x = (min_x + 1) / 2 * W; // convert from ndc to img coordinates
+        min_xi = static_cast<int>(floorf(static_cast<float>(min_x)));
+        min_xi = min(max(min_xi, 0), W - 1);
+        max_x = max(max(face_ndc[0], face_ndc[3]), face_ndc[6]);
+        max_x = (max_x + 1) / 2 * W;
+        max_xi = static_cast<int>(ceilf(static_cast<float>(max_x)));
+        max_xi = min(max(max_xi, 0), W - 1);
+
+        min_y = min(min(face_ndc[1], face_ndc[4]), face_ndc[7]);
+        min_y = (min_y + 1) / 2 * H;
+        min_yi = static_cast<int>(floorf(static_cast<float>(min_y)));
+        min_yi = min(max(min_yi, 0), H - 1);
+        max_y = max(max(face_ndc[1], face_ndc[4]), face_ndc[7]);
+        max_y = (max_y + 1) / 2 * H;
+        max_yi = static_cast<int>(ceilf(static_cast<float>(max_y)));
+ max_yi = min(max(max_yi, 0), H - 1);
+
+ vertices_per_thread_x = (max_xi - min_xi) / blockDim.x + 1;
+ vertices_per_thread_y = (max_yi - min_yi) / blockDim.y + 1;
+ }
+ __syncthreads();
+
+ const int left = min_xi + vertices_per_thread_x * threadIdx.x;
+ const int right = min(left + vertices_per_thread_x, max_xi);
+
+ const int top = min_yi + vertices_per_thread_y * threadIdx.y;
+ const int bottom = min(top + vertices_per_thread_y, max_yi);
+
+ scalar_t x, y, face_z, wa, wb, wc, wsum;
+ for (int i = top; i <= bottom; i++) {
+ for (int j = left; j <= right; j++) {
+ x = 2 * ((scalar_t)j + 0.5) / W - 1;
+ y = 2 * ((scalar_t)i + 0.5) / H - 1;
+
+ // check pixel is inside the face
+ if (((y - face_ndc[1]) * (face_ndc[3] - face_ndc[0]) > (x - face_ndc[0]) * (face_ndc[4] - face_ndc[1])) ||
+ ((y - face_ndc[4]) * (face_ndc[6] - face_ndc[3]) > (x - face_ndc[3]) * (face_ndc[7] - face_ndc[4])) ||
+ ((y - face_ndc[7]) * (face_ndc[0] - face_ndc[6]) > (x - face_ndc[6]) * (face_ndc[1] - face_ndc[7]))) {
+ continue;
+ }
+
+ wa = face_ndc_inv[0] * x + face_ndc_inv[1] * y + face_ndc_inv[2];
+ wb = face_ndc_inv[3] * x + face_ndc_inv[4] * y + face_ndc_inv[5];
+ wc = face_ndc_inv[6] * x + face_ndc_inv[7] * y + face_ndc_inv[8];
+ wsum = wa + wb + wc;
+ wa /= wsum; wb /= wsum; wc /= wsum;
+
+ wa /= face_ndc[2];
+ wb /= face_ndc[5];
+ wc /= face_ndc[8];
+ wsum = wa + wb + wc;
+ wa /= wsum; wb /= wsum; wc /= wsum;
+
+ face_z = wa * face_ndc[2] + wb * face_ndc[5] + wc * face_ndc[8];
+
+ if (face_z - eps < depth[i][j]) {
+ for (int c = 0; c < C; c++) {
+ result[i][j][c] = wa * vertice_values[ai][c] + wb * vertice_values[bi][c] + wc * vertice_values[ci][c];
+ }
+ }
+ }
+ }
+}
+
+
+template <typename scalar_t>
+__global__ void estimate_normals_cuda_kernel(
+    const torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits> vertices_ndc,
+    const torch::PackedTensorAccessor32<int32_t, 2, torch::RestrictPtrTraits> faces,
+    const torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits> depth,
+    const scalar_t* global_face_ndc_inv,
+    const int* global_is_bad_face,
+    const torch::PackedTensorAccessor32<scalar_t, 2, torch::RestrictPtrTraits> vertices,
+    torch::PackedTensorAccessor32<scalar_t, 3, torch::RestrictPtrTraits> coords,
+    torch::PackedTensorAccessor32<scalar_t, 3, torch::RestrictPtrTraits> normals
+) {
+ const int face_indx = blockIdx.x;
+
+ if (global_is_bad_face[face_indx]) {
+ return;
+ }
+
+ const int H = depth.size(0);
+ const int W = depth.size(1);
+ const scalar_t eps = 1e-5;
+
+ scalar_t min_x, max_x, min_y, max_y;
+ scalar_t v1x, v1y, v1z, v2x, v2y, v2z, nlen;
+ __shared__ int vertices_per_thread_x, vertices_per_thread_y;
+ __shared__ int ai, bi, ci;
+ __shared__ scalar_t face[9];
+ __shared__ scalar_t face_ndc[9];
+ __shared__ scalar_t face_ndc_inv[9];
+ __shared__ int min_xi, max_xi, min_yi, max_yi;
+ __shared__ scalar_t nx, ny, nz;
+
+ if (threadIdx.x == 0 && threadIdx.y == 0) {
+ ai = faces[face_indx][0];
+ bi = faces[face_indx][1];
+ ci = faces[face_indx][2];
+
+ face[0] = vertices[ai][0]; face[1] = vertices[ai][1]; face[2] = vertices[ai][2];
+ face[3] = vertices[bi][0]; face[4] = vertices[bi][1]; face[5] = vertices[bi][2];
+ face[6] = vertices[ci][0]; face[7] = vertices[ci][1]; face[8] = vertices[ci][2];
+
+ v1x = face[3] - face[0]; v2x = face[6] - face[0];
+ v1y = face[4] - face[1]; v2y = face[7] - face[1];
+ v1z = face[5] - face[2]; v2z = face[8] - face[2];
+
+ nx = v1y * v2z - v1z * v2y;
+ ny = v1z * v2x - v1x * v2z;
+ nz = v1x * v2y - v1y * v2x;
+ nlen = nx * nx + ny * ny + nz * nz;
+ nlen = (scalar_t)sqrt((float)nlen);
+ nx /= nlen;
+ ny /= nlen;
+ nz /= nlen;
+
+ face_ndc[0] = vertices_ndc[ai][0]; face_ndc[1] = vertices_ndc[ai][1]; face_ndc[2] = vertices_ndc[ai][2];
+ face_ndc[3] = vertices_ndc[bi][0]; face_ndc[4] = vertices_ndc[bi][1]; face_ndc[5] = vertices_ndc[bi][2];
+ face_ndc[6] = vertices_ndc[ci][0]; face_ndc[7] = vertices_ndc[ci][1]; face_ndc[8] = vertices_ndc[ci][2];
+
+ for (int i = 0; i < 9; ++i) {
+ face_ndc_inv[i] = global_face_ndc_inv[9 * face_indx + i];
+ }
+
+ min_x = min(min(face_ndc[0], face_ndc[3]), face_ndc[6]);
+ min_x = (min_x + 1) / 2 * W; // convert from ndc to img coordinates
+        min_xi = static_cast<int>(floorf(static_cast<float>(min_x)));
+        min_xi = min(max(min_xi, 0), W - 1);
+        max_x = max(max(face_ndc[0], face_ndc[3]), face_ndc[6]);
+        max_x = (max_x + 1) / 2 * W;
+        max_xi = static_cast<int>(ceilf(static_cast<float>(max_x)));
+        max_xi = min(max(max_xi, 0), W - 1);
+
+        min_y = min(min(face_ndc[1], face_ndc[4]), face_ndc[7]);
+        min_y = (min_y + 1) / 2 * H;
+        min_yi = static_cast<int>(floorf(static_cast<float>(min_y)));
+        min_yi = min(max(min_yi, 0), H - 1);
+        max_y = max(max(face_ndc[1], face_ndc[4]), face_ndc[7]);
+        max_y = (max_y + 1) / 2 * H;
+        max_yi = static_cast<int>(ceilf(static_cast<float>(max_y)));
+ max_yi = min(max(max_yi, 0), H - 1);
+
+ vertices_per_thread_x = (max_xi - min_xi) / blockDim.x + 1;
+ vertices_per_thread_y = (max_yi - min_yi) / blockDim.y + 1;
+ }
+ __syncthreads();
+
+ const int left = min_xi + vertices_per_thread_x * threadIdx.x;
+ const int right = min(left + vertices_per_thread_x, max_xi);
+
+ const int top = min_yi + vertices_per_thread_y * threadIdx.y;
+ const int bottom = min(top + vertices_per_thread_y, max_yi);
+
+ scalar_t x, y, face_z, wa, wb, wc, wsum;
+ for (int i = top; i <= bottom; i++) {
+ for (int j = left; j <= right; j++) {
+ x = 2 * ((scalar_t)j + 0.5) / W - 1;
+ y = 2 * ((scalar_t)i + 0.5) / H - 1;
+
+ // check pixel is inside the face
+ if (((y - face_ndc[1]) * (face_ndc[3] - face_ndc[0]) > (x - face_ndc[0]) * (face_ndc[4] - face_ndc[1])) ||
+ ((y - face_ndc[4]) * (face_ndc[6] - face_ndc[3]) > (x - face_ndc[3]) * (face_ndc[7] - face_ndc[4])) ||
+ ((y - face_ndc[7]) * (face_ndc[0] - face_ndc[6]) > (x - face_ndc[6]) * (face_ndc[1] - face_ndc[7]))) {
+ continue;
+ }
+
+ wa = face_ndc_inv[0] * x + face_ndc_inv[1] * y + face_ndc_inv[2];
+ wb = face_ndc_inv[3] * x + face_ndc_inv[4] * y + face_ndc_inv[5];
+ wc = face_ndc_inv[6] * x + face_ndc_inv[7] * y + face_ndc_inv[8];
+ wsum = wa + wb + wc;
+ wa /= wsum; wb /= wsum; wc /= wsum;
+
+ wa /= face_ndc[2];
+ wb /= face_ndc[5];
+ wc /= face_ndc[8];
+ wsum = wa + wb + wc;
+ wa /= wsum; wb /= wsum; wc /= wsum;
+
+ face_z = wa * face_ndc[2] + wb * face_ndc[5] + wc * face_ndc[8];
+
+ if (face_z - eps < depth[i][j]) {
+ coords[i][j][0] = wa * face[0] + wb * face[3] + wc * face[6];
+ coords[i][j][1] = wa * face[1] + wb * face[4] + wc * face[7];
+ coords[i][j][2] = wa * face[2] + wb * face[5] + wc * face[8];
+
+ normals[i][j][0] = nx;
+ normals[i][j][1] = ny;
+ normals[i][j][2] = nz;
+ }
+ }
+ }
+}
+
+// cpp defined functions
+
+torch::Tensor project_mesh_cuda(
+ const torch::Tensor& vertices_ndc,
+ const torch::Tensor& faces,
+ const torch::Tensor& vertice_values,
+ const torch::Tensor& vertices_filter,
+ int H, int W
+) {
+ const int N = vertices_ndc.size(0);
+ const int C = vertice_values.size(1);
+ const int M = faces.size(0);
+
+ const int gpuid = vertices_ndc.device().index();
+ AT_CUDA_CHECK(cudaSetDevice(gpuid));
+ auto options = torch::dtype(vertices_ndc.scalar_type()).device(torch::kCUDA, gpuid);
+
+ const dim3 dimGrid(M);
+ const dim3 dimBlock(4, 4);
+
+ auto depth = torch::ones({H, W}, options) * 1e10;
+ auto result = torch::zeros({H, W, C}, options);
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(vertices_ndc.scalar_type(), "project_mesh_cuda_kernel", [&] {
+ scalar_t* global_face_ndc_inv;
+ cudaMalloc(&global_face_ndc_inv, M * 9 * sizeof(scalar_t));
+ int* global_is_bad_face;
+ cudaMalloc(&global_is_bad_face, M * sizeof(int));
+ rasterize_cuda_kernel<scalar_t><<<dimGrid, dimBlock>>>(
+ vertices_ndc.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
+ faces.packed_accessor32<int32_t, 2, torch::RestrictPtrTraits>(),
+ vertices_filter.packed_accessor32<uint8_t, 1, torch::RestrictPtrTraits>(),
+ depth.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
+ global_face_ndc_inv,
+ global_is_bad_face
+ );
+ AT_CUDA_CHECK(cudaGetLastError());
+
+ interpolate_cuda_kernel<scalar_t><<<dimGrid, dimBlock>>>(
+ vertices_ndc.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
+ faces.packed_accessor32<int32_t, 2, torch::RestrictPtrTraits>(),
+ depth.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
+ global_face_ndc_inv,
+ global_is_bad_face,
+ vertice_values.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
+ result.packed_accessor32<scalar_t, 3, torch::RestrictPtrTraits>()
+ );
+ AT_CUDA_CHECK(cudaGetLastError());
+
+ cudaFree(global_face_ndc_inv);
+ cudaFree(global_is_bad_face);
+ AT_CUDA_CHECK(cudaGetLastError());
+ });
+
+ return result;
+}
+
+
+std::vector<torch::Tensor> estimate_normals_cuda(
+ const torch::Tensor& vertices_ndc,
+ const torch::Tensor& faces,
+ const torch::Tensor& vertices,
+ const torch::Tensor& vertices_filter,
+ int H, int W
+) {
+ const int N = vertices_ndc.size(0);
+ const int M = faces.size(0);
+
+ const int gpuid = vertices_ndc.device().index();
+ AT_CUDA_CHECK(cudaSetDevice(gpuid));
+ auto options = torch::dtype(vertices_ndc.scalar_type()).device(torch::kCUDA, gpuid);
+
+ const dim3 dimGrid(M);
+ const dim3 dimBlock(4, 4);
+
+ auto depth = torch::ones({H, W}, options) * 1e10;
+ auto coords = torch::zeros({H, W, 3}, options);
+ auto normals = torch::zeros({H, W, 3}, options);
+
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(vertices_ndc.scalar_type(), "project_mesh_cuda_kernel", [&] {
+ scalar_t* global_face_ndc_inv;
+ cudaMalloc(&global_face_ndc_inv, M * 9 * sizeof(scalar_t));
+ int* global_is_bad_face;
+ cudaMalloc(&global_is_bad_face, M * sizeof(int));
+ rasterize_cuda_kernel<scalar_t><<<dimGrid, dimBlock>>>(
+ vertices_ndc.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
+ faces.packed_accessor32<int32_t, 2, torch::RestrictPtrTraits>(),
+ vertices_filter.packed_accessor32<uint8_t, 1, torch::RestrictPtrTraits>(),
+ depth.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
+ global_face_ndc_inv,
+ global_is_bad_face
+ );
+ AT_CUDA_CHECK(cudaGetLastError());
+
+ estimate_normals_cuda_kernel<scalar_t><<<dimGrid, dimBlock>>>(
+ vertices_ndc.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
+ faces.packed_accessor32<int32_t, 2, torch::RestrictPtrTraits>(),
+ depth.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
+ global_face_ndc_inv,
+ global_is_bad_face,
+ vertices.packed_accessor32<scalar_t, 2, torch::RestrictPtrTraits>(),
+ coords.packed_accessor32<scalar_t, 3, torch::RestrictPtrTraits>(),
+ normals.packed_accessor32<scalar_t, 3, torch::RestrictPtrTraits>()
+ );
+ AT_CUDA_CHECK(cudaGetLastError());
+
+ cudaFree(global_face_ndc_inv);
+ cudaFree(global_is_bad_face);
+ AT_CUDA_CHECK(cudaGetLastError());
+ });
+
+ return {coords, normals};
+}
diff --git a/detrsmpl/core/renderer/mpr_renderer/rasterizer.py b/detrsmpl/core/renderer/mpr_renderer/rasterizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..18746e8b582193b8cb15219be568a9d74cf411a3
--- /dev/null
+++ b/detrsmpl/core/renderer/mpr_renderer/rasterizer.py
@@ -0,0 +1,68 @@
+import torch
+
+try:
+ from detrsmpl.core.renderer.mpr_renderer.cuda.rasterizer import \
+ estimate_normals as estimate_normals_cuda # noqa: E501
+ from detrsmpl.core.renderer.mpr_renderer.cuda.rasterizer import \
+ project_mesh as project_mesh_cuda # noqa: E501
+except (ImportError, ModuleNotFoundError):
+ print('Please reinstall MMHuman3D to build mpr_renderer.')
+ raise
+
+
+def estimate_normals(vertices, faces, pinhole, vertices_filter=None):
+ """Estimate the vertices normals with the specified faces and camera.
+
+ Args:
+ vertices (torch.tensor): Shape should be (num_verts, 3).
+ faces (torch.tensor): The faces of the mesh.
+ pinhole (object): The pinhole camera object.
+
+ Returns:
+ coords (torch.tensor): The estimated coordinates.
+ normals (torch.tensor): The estimated normals.
+ """
+ if vertices_filter is None:
+ assert torch.is_tensor(vertices)
+ assert vertices.is_cuda
+ assert len(vertices.shape) == 2
+ n = vertices.shape[0]
+ vertices_filter = torch.ones((n),
+ dtype=torch.uint8,
+ device=vertices.device)
+ vertices = vertices.contiguous()
+ vertices_ndc = pinhole.project_ndc(vertices)
+ coords, normals = estimate_normals_cuda(vertices_ndc, faces, vertices,
+ vertices_filter, pinhole.h,
+ pinhole.w)
+ return coords, normals
+
+
+def project_mesh(vertices,
+ faces,
+ vertice_values,
+ pinhole,
+ vertices_filter=None):
+ """Project mesh to the image plane with the specified faces and camera.
+
+ Args:
+ vertices (torch.tensor): Shape should be (num_verts, 3).
+ faces (torch.tensor): The faces of the mesh.
+ vertice_values (torch.tensor): The per-vertex values (e.g., depth)
+ to be projected.
+ pinhole (object): The pinhole camera object.
+
+ Returns:
+ torch.tensor: The projected mesh.
+ """
+ if vertices_filter is None:
+ assert torch.is_tensor(vertices)
+ assert vertices.is_cuda
+ assert len(vertices.shape) == 2
+ n = vertices.shape[0]
+ vertices_filter = torch.ones((n),
+ dtype=torch.uint8,
+ device=vertices.device)
+ vertices = vertices.contiguous()
+ vertices_ndc = pinhole.project_ndc(vertices)
+ return project_mesh_cuda(vertices_ndc, faces, vertice_values,
+ vertices_filter, pinhole.h, pinhole.w)
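+
+# The block below is a commented-out usage sketch, not part of the module:
+# it assumes CUDA tensors and a `Pinhole2D` camera (constructed as in
+# smpl_realrender.py); the tensor shapes are illustrative placeholders.
+#
+# import torch
+# from detrsmpl.core.renderer.mpr_renderer.camera import Pinhole2D
+#
+# pinhole = Pinhole2D(fx=5000., fy=5000., cx=112., cy=112., w=224, h=224)
+# verts = torch.rand(6890, 3, device='cuda') + torch.tensor([0., 0., 2.], device='cuda')
+# faces = torch.randint(0, 6890, (13776, 3), dtype=torch.int32, device='cuda')
+# coords, normals = estimate_normals(verts, faces, pinhole)      # each (h, w, 3)
+# depth_map = project_mesh(verts, faces, verts[:, [2]], pinhole)  # (h, w, 1)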
diff --git a/detrsmpl/core/renderer/mpr_renderer/smpl_realrender.py b/detrsmpl/core/renderer/mpr_renderer/smpl_realrender.py
new file mode 100644
index 0000000000000000000000000000000000000000..e795b1fbabfccd59f7ba4b638071ec211cf4c1b6
--- /dev/null
+++ b/detrsmpl/core/renderer/mpr_renderer/smpl_realrender.py
@@ -0,0 +1,48 @@
+import cv2
+import numpy as np
+import torch
+
+from detrsmpl.core.renderer.mpr_renderer.camera import Pinhole2D
+from detrsmpl.core.renderer.mpr_renderer.rasterizer import \
+ estimate_normals # noqa: E501
+from detrsmpl.core.renderer.mpr_renderer.utils import \
+ vis_normals # noqa: E501
+
+
+class VisualizerMeshSMPL:
+ def __init__(self,
+ device=None,
+ body_models=None,
+ focal_length=5000.,
+ camera_center=[112., 112.],
+ resolution=None,
+ scale=None):
+ self.body_models = body_models
+ self.pinhole2d = Pinhole2D(fx=focal_length,
+ fy=focal_length,
+ cx=camera_center[0],
+ cy=camera_center[1],
+ w=resolution[1],
+ h=resolution[0])
+ self.device = torch.device(device)
+ self.faces = self.body_models.faces_tensor.to(dtype=torch.int32,
+ device=self.device)
+
+ def __call__(self, vertices, bg=None, **kwargs):
+ assert vertices.device == self.faces.device
+ vertices = vertices.clone()
+ coords, normals = estimate_normals(vertices=vertices,
+ faces=self.faces,
+ pinhole=self.pinhole2d)
+ vis = vis_normals(coords, normals)
+ if bg is not None:
+ mask = coords[:, :, [2]] <= 0
+ vis = (
+ vis[:, :, None] +
+ torch.tensor(bg).to(mask.device) * mask).cpu().numpy().astype(
+ np.uint8)
+ else:
+ # convert gray to 3 channel img
+ vis = vis.detach().cpu().numpy()
+ vis = cv2.merge((vis, vis, vis))
+ return vis
diff --git a/detrsmpl/core/renderer/mpr_renderer/utils.py b/detrsmpl/core/renderer/mpr_renderer/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..3256d1092a5dd3481a8201530d8f6c5d8eb50dcd
--- /dev/null
+++ b/detrsmpl/core/renderer/mpr_renderer/utils.py
@@ -0,0 +1,37 @@
+import torch
+
+
+def vis_z_buffer(z, percentile=1, vis_pad=0.2):
+ z = z[:, :, 0]
+ mask = z > 1e-5
+ if torch.sum(mask) == 0:
+ z[...] = 0
+ else:
+ vmin = torch.quantile(z[mask], percentile / 100)
+ vmax = torch.quantile(z[mask], 1 - percentile / 100)
+ pad = (vmax - vmin) * vis_pad
+ vmin_padded = vmin - pad
+ vmax_padded = vmax + pad
+ z[mask] = vmin + vmax - z[mask]
+ z = (z - vmin_padded) / (vmax_padded - vmin_padded)
+ z = torch.clip(torch.round(z * 255), 0, 255)
+ z_cpu = z.to(dtype=torch.uint8).detach().cpu().numpy()
+ return z_cpu
+
+
+def vis_normals(coords, normals, vis_pad=0.2):
+ mask = coords[:, :, 2] > 0
+ coords_masked = -coords[mask]
+ normals_masked = normals[mask]
+
+ coords_len = torch.sqrt(torch.sum(coords_masked**2, dim=1))
+
+ dot = torch.sum(coords_masked * normals_masked, dim=1) / coords_len
+
+ h, w = normals.shape[:2]
+ vis = torch.zeros((h, w), dtype=coords.dtype, device=coords.device)
+ vis[mask] = torch.clamp(dot, 0, 1) * (1 - 2 * vis_pad) + vis_pad
+
+ vis = (vis * 255).to(dtype=torch.uint8)
+
+ return vis
diff --git a/detrsmpl/core/renderer/torch3d_renderer/__init__.py b/detrsmpl/core/renderer/torch3d_renderer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/core/renderer/torch3d_renderer/base_renderer.py b/detrsmpl/core/renderer/torch3d_renderer/base_renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f6a37a534cca1f1c6e9ae5fd5e2f4971aa326f3e
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/base_renderer.py
@@ -0,0 +1,273 @@
+import os.path as osp
+import shutil
+from pathlib import Path
+from typing import Optional, Tuple, Union
+
+import cv2
+import mmcv
+import torch
+import torch.nn as nn
+from pytorch3d.renderer import (
+ AmbientLights,
+ BlendParams,
+ DirectionalLights,
+ Materials,
+ MeshRasterizer,
+ PointLights,
+ RasterizationSettings,
+)
+
+from detrsmpl.core.cameras import MMCamerasBase
+from detrsmpl.utils.ffmpeg_utils import images_to_gif, images_to_video
+from detrsmpl.utils.path_utils import check_path_suffix
+from .lights import build_lights
+from .shader import build_shader
+from .utils import normalize, rgb2bgr, tensor2array
+
+
+class BaseRenderer(nn.Module):
+ def __init__(self,
+ resolution: Tuple[int, int] = None,
+ device: Union[torch.device, str] = 'cpu',
+ output_path: Optional[str] = None,
+ out_img_format: str = '%06d.png',
+ **kwargs) -> None:
+ """BaseRenderer for differentiable rendering and visualization.
+
+ Args:
+ resolution (Iterable[int]):
+ (width, height) of the rendered images resolution.
+ device (Union[torch.device, str], optional):
+ You can pass a str or torch.device for cpu or gpu render.
+ Defaults to 'cpu'.
+ output_path (Optional[str], optional):
+ Output path of the video or images to be saved.
+ Defaults to None.
+ out_img_format (str, optional): The image format string for
+ saving the images.
+ Defaults to '%06d.png'.
+
+ **kwargs is used for render setting.
+ You can set up your render kwargs like:
+ {
+ 'shader': {
+ 'type': 'soft_phong'
+ },
+ 'lights': {
+ 'type': 'directional',
+ 'direction': [[10.0, 10.0, 10.0]],
+ 'ambient_color': [[0.5, 0.5, 0.5]],
+ 'diffuse_color': [[0.5, 0.5, 0.5]],
+ 'specular_color': [[0.5, 0.5, 0.5]],
+ },
+ 'materials': {
+ 'ambient_color': [[1, 1, 1]],
+ 'diffuse_color': [[0.5, 0.5, 0.5]],
+ 'specular_color': [[0.5, 0.5, 0.5]],
+ 'shininess': 60.0,
+ },
+ 'rasterizer': {
+ 'bin_size': 0,
+ 'blur_radius': 0.0,
+ 'faces_per_pixel': 1,
+ 'perspective_correct': False,
+ },
+ 'blend_params': {'background_color': (1.0, 1.0, 1.0)},
+ },
+ You can change any parameter within its suitable range; please check
+ configs/render/smpl.py. A commented example config is given at the
+ end of this file.
+
+ Returns:
+ None
+ """
+ super().__init__()
+ self.device = device
+ self.output_path = output_path
+ self.resolution = resolution
+ self.temp_path = None
+ self.out_img_format = out_img_format
+ self._set_output_path(output_path)
+ self._init_renderer(**kwargs)
+
+ def _init_renderer(self,
+ rasterizer: Union[dict, nn.Module] = None,
+ shader: Union[dict, nn.Module] = None,
+ materials: Union[dict, Materials] = None,
+ lights: Union[dict, DirectionalLights, PointLights,
+ AmbientLights] = None,
+ blend_params: Union[dict, BlendParams] = None,
+ **kwargs):
+ """Initial renderer."""
+ if isinstance(materials, dict):
+ materials = Materials(**materials)
+ elif materials is None:
+ materials = Materials()
+ elif not isinstance(materials, Materials):
+ raise TypeError(f'Wrong type of materials: {type(materials)}.')
+
+ if isinstance(lights, dict):
+ self.lights = build_lights(lights)
+ elif lights is None:
+ self.lights = AmbientLights()
+ elif isinstance(lights,
+ (AmbientLights, PointLights, DirectionalLights)):
+ self.lights = lights
+ else:
+ raise TypeError(f'Wrong type of lights: {type(lights)}.')
+
+ if isinstance(blend_params, dict):
+ blend_params = BlendParams(**blend_params)
+ elif blend_params is None:
+ blend_params = BlendParams()
+ elif not isinstance(blend_params, BlendParams):
+ raise TypeError(
+ f'Wrong type of blend_params: {type(blend_params)}.')
+
+ if isinstance(rasterizer, nn.Module):
+ if self.resolution is not None:
+ rasterizer.raster_settings.image_size = self.resolution
+ self.rasterizer = rasterizer
+ elif isinstance(rasterizer, dict):
+ if self.resolution is not None:
+ rasterizer['image_size'] = self.resolution
+ raster_settings = RasterizationSettings(**rasterizer)
+ self.rasterizer = MeshRasterizer(raster_settings=raster_settings)
+ elif rasterizer is None:
+ self.rasterizer = MeshRasterizer(
+ raster_settings=RasterizationSettings(
+ image_size=self.resolution,
+ bin_size=0,
+ blur_radius=0,
+ faces_per_pixel=1,
+ perspective_correct=False))
+ else:
+ raise TypeError(
+ f'Wrong type of rasterizer: {type(self.rasterizer)}.')
+
+ if self.resolution is None:
+ self.resolution = self.rasterizer.raster_settings.image_size
+ assert self.resolution is not None
+ self.resolution = (self.resolution, self.resolution) if isinstance(
+ self.resolution, int) else tuple(self.resolution)
+ if isinstance(shader, nn.Module):
+ self.shader = shader
+ elif isinstance(shader, dict):
+ shader.update(materials=materials,
+ lights=self.lights,
+ blend_params=blend_params)
+ self.shader = build_shader(shader)
+ elif shader is None:
+ self.shader = build_shader(
+ dict(type=self.shader_type,
+ materials=materials,
+ lights=self.lights,
+ blend_params=blend_params))
+ else:
+ raise TypeError(f'Wrong type of shader: {type(self.shader)}.')
+ self = self.to(self.device)
+
+ def to(self, device):
+ if isinstance(device, str):
+ device = torch.device(device)
+ self.device = device
+ if getattr(self.rasterizer, 'cameras', None) is not None:
+ self.rasterizer.cameras = self.rasterizer.cameras.to(device)
+
+ if getattr(self.shader, 'cameras', None) is not None:
+ self.shader.cameras = self.shader.cameras.to(device)
+ if getattr(self.shader, 'materials', None) is not None:
+ self.shader.materials = self.shader.materials.to(device)
+ if getattr(self.shader, 'lights', None) is not None:
+ self.shader.lights = self.shader.lights.to(device)
+ return self
+
+ def _set_output_path(self, output_path):
+ if output_path is not None:
+ self.output_path = output_path
+ if check_path_suffix(output_path, ['.mp4', '.gif']):
+ self.temp_path = osp.join(
+ Path(output_path).parent,
+ Path(output_path).name + '_output_temp')
+ elif check_path_suffix(output_path, ['.png', '.jpg', '.jpeg']):
+ mmcv.mkdir_or_exist(Path(output_path).parent)
+ self.temp_path = osp.join(
+ Path(output_path).parent,
+ Path(output_path).name + '_output_temp')
+ else:
+ self.temp_path = output_path
+ mmcv.mkdir_or_exist(self.temp_path)
+ print('Make dir', self.temp_path)
+
+ def _update_resolution(self, cameras, **kwargs):
+ if isinstance(cameras, MMCamerasBase):
+ self.resolution = (int(cameras.resolution[0][0]),
+ int(cameras.resolution[0][1]))
+ if 'resolution' in kwargs:
+ self.resolution = kwargs.get('resolution')
+ self.rasterizer.raster_settings.image_size = self.resolution
+
+ def export(self):
+ """Export output video if need."""
+ if self.output_path is not None:
+ folder = self.temp_path if self.temp_path is not None else\
+ self.output_path
+ if check_path_suffix(self.output_path, ['.mp4']):
+ images_to_video(input_folder=folder,
+ output_path=self.output_path,
+ img_format=self.out_img_format)
+ elif check_path_suffix(self.output_path, ['.gif']):
+ images_to_gif(input_folder=folder,
+ output_path=self.output_path,
+ img_format=self.out_img_format)
+
+ def __del__(self):
+ """remove_temp_files."""
+ if self.output_path is not None:
+ if Path(self.output_path).is_file():
+ self._remove_temp_frames()
+
+ def _remove_temp_frames(self):
+ """Remove temp files."""
+ if self.temp_path:
+ if osp.exists(self.temp_path) and osp.isdir(self.temp_path):
+ shutil.rmtree(self.temp_path)
+
+ def _write_images(self, rgba, backgrounds, indexes):
+ """Write output/temp images."""
+ if rgba.shape[-1] > 3:
+ rgbs, valid_masks = rgba[..., :3], rgba[..., 3:]
+ else:
+ rgbs = rgba[..., :3]
+ valid_masks = torch.ones_like(rgbs[..., :1])
+ rgbs = normalize(rgbs, origin_value_range=(0, 1), clip=True)
+ bgrs = rgb2bgr(rgbs)
+ if backgrounds is not None:
+ image_max = 1.0 if backgrounds.max() <= 1.0 else 255
+ backgrounds = normalize(backgrounds,
+ origin_value_range=(0, image_max),
+ out_value_range=(0, 1))
+ output_images = bgrs * valid_masks + (1 -
+ valid_masks) * backgrounds
+ output_images = tensor2array(output_images)
+
+ else:
+ output_images = tensor2array(bgrs)
+ for idx, real_idx in enumerate(indexes):
+ folder = self.temp_path if self.temp_path is not None else\
+ self.output_path
+ cv2.imwrite(osp.join(folder, self.out_img_format % real_idx),
+ output_images[idx])
+
+ def forward(self):
+ """"Should be called by each sub renderer class."""
+ raise NotImplementedError()
+
+ def tensor2rgba(self, tensor: torch.Tensor):
+ valid_masks = (tensor[..., 3:] > 0) * 1.0
+ rgbs = tensor[..., :3]
+
+ rgbs = normalize(rgbs,
+ origin_value_range=[0, 1],
+ out_value_range=[0, 1])
+ rgba = torch.cat([rgbs, valid_masks], -1)
+ return rgba
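+
+# The dict below is a commented-out sketch of the render kwargs described in
+# the BaseRenderer docstring; the concrete values are illustrative and the
+# type aliases come from the registries in builder.py and lights/builder.py.
+#
+# renderer_cfg = dict(
+#     type='mesh',
+#     resolution=(512, 512),
+#     shader=dict(type='SoftPhongShader'),
+#     lights=dict(type='directional',
+#                 direction=[[10.0, 10.0, 10.0]],
+#                 ambient_color=[[0.5, 0.5, 0.5]],
+#                 diffuse_color=[[0.5, 0.5, 0.5]],
+#                 specular_color=[[0.5, 0.5, 0.5]]),
+#     materials=dict(shininess=60.0),
+#     rasterizer=dict(bin_size=0, blur_radius=0.0, faces_per_pixel=1,
+#                     perspective_correct=False),
+#     blend_params=dict(background_color=(1.0, 1.0, 1.0)),
+# )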
diff --git a/detrsmpl/core/renderer/torch3d_renderer/builder.py b/detrsmpl/core/renderer/torch3d_renderer/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..57834e17a1c44c1c5a7008a21d6fdd019c6f8ec3
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/builder.py
@@ -0,0 +1,45 @@
+from mmcv.utils import Registry
+
+from .base_renderer import BaseRenderer
+from .depth_renderer import DepthRenderer
+from .mesh_renderer import MeshRenderer
+from .normal_renderer import NormalRenderer
+from .pointcloud_renderer import PointCloudRenderer
+from .segmentation_renderer import SegmentationRenderer
+from .silhouette_renderer import SilhouetteRenderer
+from .uv_renderer import UVRenderer
+
+RENDERER = Registry('renderer')
+RENDERER.register_module(
+ name=['base', 'Base', 'base_renderer', 'BaseRenderer'],
+ module=BaseRenderer)
+RENDERER.register_module(
+ name=['Depth', 'depth', 'depth_renderer', 'DepthRenderer'],
+ module=DepthRenderer)
+RENDERER.register_module(
+ name=['Mesh', 'mesh', 'mesh_renderer', 'MeshRenderer'],
+ module=MeshRenderer)
+RENDERER.register_module(
+ name=['Normal', 'normal', 'normal_renderer', 'NormalRenderer'],
+ module=NormalRenderer)
+RENDERER.register_module(name=[
+ 'PointCloud', 'pointcloud', 'point_cloud', 'pointcloud_renderer',
+ 'PointCloudRenderer'
+],
+ module=PointCloudRenderer)
+RENDERER.register_module(name=[
+ 'segmentation', 'segmentation_renderer', 'Segmentation',
+ 'SegmentationRenderer'
+],
+ module=SegmentationRenderer)
+RENDERER.register_module(name=[
+ 'silhouette', 'silhouette_renderer', 'Silhouette', 'SilhouetteRenderer'
+],
+ module=SilhouetteRenderer)
+RENDERER.register_module(name=['uv_renderer', 'uv', 'UV', 'UVRenderer'],
+ module=UVRenderer)
+
+
+def build_renderer(cfg):
+ """Build renderers."""
+ return RENDERER.build(cfg)
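+
+
+# Commented-out usage sketch (the config values are illustrative): any of
+# the aliases registered above may be used as the `type` field.
+#
+# renderer = build_renderer(
+#     dict(type='depth', resolution=(256, 256), device='cuda', depth_max=10.0))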
diff --git a/detrsmpl/core/renderer/torch3d_renderer/depth_renderer.py b/detrsmpl/core/renderer/torch3d_renderer/depth_renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..51a5cbc5f2817769c92270316d6f4fd0b2c352de
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/depth_renderer.py
@@ -0,0 +1,109 @@
+from typing import Iterable, Optional, Tuple, Union
+
+import torch
+from pytorch3d.structures import Meshes
+
+from detrsmpl.core.cameras import MMCamerasBase
+from .base_renderer import BaseRenderer
+from .shader import build_shader
+from .utils import normalize
+
+
+class DepthRenderer(BaseRenderer):
+ """Render depth map with the help of camera system."""
+ shader_type = 'DepthShader'
+
+ def __init__(
+ self,
+ resolution: Tuple[int, int] = None,
+ device: Union[torch.device, str] = 'cpu',
+ output_path: Optional[str] = None,
+ out_img_format: str = '%06d.png',
+ depth_max: Union[int, float, torch.Tensor] = None,
+ **kwargs,
+ ) -> None:
+ """Renderer for depth map of meshes.
+
+ Args:
+ resolution (Iterable[int]):
+ (width, height) of the rendered images resolution.
+ device (Union[torch.device, str], optional):
+ You can pass a str or torch.device for cpu or gpu render.
+ Defaults to 'cpu'.
+ output_path (Optional[str], optional):
+ Output path of the video or images to be saved.
+ Defaults to None.
+ out_img_format (str, optional): The image format string for
+ saving the images.
+ Defaults to '%06d.png'.
+
+ depth_max (Union[int, float, torch.Tensor], optional):
+ The max value for normalize depth range. Defaults to None.
+
+ Returns:
+ None
+ """
+ super().__init__(resolution=resolution,
+ device=device,
+ output_path=output_path,
+ out_img_format=out_img_format,
+ **kwargs)
+ self.depth_max = depth_max
+
+ def _init_renderer(self,
+ rasterizer=None,
+ shader=None,
+ materials=None,
+ lights=None,
+ blend_params=None,
+ **kwargs):
+ shader = build_shader(dict(
+ type='DepthShader')) if shader is None else shader
+ return super()._init_renderer(rasterizer, shader, materials, lights,
+ blend_params, **kwargs)
+
+ def forward(self,
+ meshes: Optional[Meshes] = None,
+ cameras: Optional[MMCamerasBase] = None,
+ indexes: Optional[Iterable[int]] = None,
+ backgrounds: Optional[torch.Tensor] = None,
+ **kwargs):
+ """Render depth map.
+
+ Args:
+ meshes (Optional[Meshes], optional): meshes to be rendered.
+ Defaults to None.
+ cameras (Optional[MMCamerasBase], optional): cameras for rendering.
+ Defaults to None.
+ indexes (Optional[Iterable[int]], optional): indexes for the
+ images.
+ Defaults to None.
+ backgrounds (Optional[torch.Tensor], optional): background images.
+ Defaults to None.
+
+ Returns:
+ Union[torch.Tensor, None]: return tensor or None.
+ """
+ meshes = meshes.to(self.device)
+ self._update_resolution(cameras, **kwargs)
+
+ fragments = self.rasterizer(meshes_world=meshes, cameras=cameras)
+ depth_map = self.shader(fragments=fragments,
+ meshes=meshes,
+ cameras=cameras)
+
+ if self.output_path is not None:
+ rgba = self.tensor2rgba(depth_map)
+ self._write_images(rgba, backgrounds, indexes)
+
+ return depth_map
+
+ def tensor2rgba(self, tensor: torch.Tensor):
+ rgbs, valid_masks = tensor.repeat(1, 1, 1, 3), (tensor > 0) * 1.0
+ depth_max = self.depth_max if self.depth_max is not None else rgbs.max(
+ )
+ rgbs = normalize(rgbs,
+ origin_value_range=(0, depth_max),
+ out_value_range=(0, 1))
+ return torch.cat([rgbs, valid_masks], -1)
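+
+# Commented-out sketch of how tensor2rgba above normalizes depth: with
+# depth_max = D, a raw depth d maps to d / D in [0, 1], and empty pixels
+# (depth == 0) get alpha 0. `my_meshes` and `my_cameras` are placeholders.
+#
+# renderer = DepthRenderer(resolution=(256, 256), device='cuda', depth_max=10.0)
+# depth_map = renderer(meshes=my_meshes, cameras=my_cameras)
+# rgba = renderer.tensor2rgba(depth_map)  # last channel is the validity mask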
diff --git a/detrsmpl/core/renderer/torch3d_renderer/lights/__init__.py b/detrsmpl/core/renderer/torch3d_renderer/lights/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..06d85748e0646a09f1d6828d1d6ffc636011bc27
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/lights/__init__.py
@@ -0,0 +1,10 @@
+# yapf: disable
+from .builder import ( # noqa: F401
+ AmbientLights,
+ DirectionalLights,
+ PointLights,
+ build_lights,
+)
+from .lights import MMLights # noqa: F401
+
+# yapf: enable
diff --git a/detrsmpl/core/renderer/torch3d_renderer/lights/builder.py b/detrsmpl/core/renderer/torch3d_renderer/lights/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c82511f41722380486567503b83c817441070ce
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/lights/builder.py
@@ -0,0 +1,17 @@
+from mmcv.utils import Registry
+
+from .lights import AmbientLights, DirectionalLights, PointLights # noqa:E401
+
+LIGHTS = Registry('lights')
+LIGHTS.register_module(
+ name=['directional', 'directional_lights', 'DirectionalLights'],
+ module=DirectionalLights)
+LIGHTS.register_module(name=['point', 'point_lights', 'PointLights'],
+ module=PointLights)
+LIGHTS.register_module(name=['ambient', 'ambient_lights', 'AmbientLights'],
+ module=AmbientLights)
+
+
+def build_lights(cfg):
+ """Build lights."""
+ return LIGHTS.build(cfg)
diff --git a/detrsmpl/core/renderer/torch3d_renderer/lights/lights.py b/detrsmpl/core/renderer/torch3d_renderer/lights/lights.py
new file mode 100644
index 0000000000000000000000000000000000000000..62eda2eddba87237c65e6f2924a8921cde6fed74
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/lights/lights.py
@@ -0,0 +1,80 @@
+from typing import Union
+
+import torch
+from pytorch3d.renderer.lighting import AmbientLights as _AmbientLights
+from pytorch3d.renderer.lighting import DirectionalLights as _DirectionalLights
+from pytorch3d.renderer.lighting import PointLights as _PointLights
+from pytorch3d.renderer.utils import TensorProperties
+
+MMLIGHT_ATTR = [
+ 'ambient_color', 'diffuse_color', 'specular_color', 'location', 'direction'
+]
+
+
+class MMLights(TensorProperties):
+ def __init__(self, **kwargs) -> None:
+ super().__init__(**kwargs)
+ _N = 1
+ self.mmlight_attr_list = []
+ for attr_name in MMLIGHT_ATTR:
+ if hasattr(self, attr_name):
+ self.mmlight_attr_list.append(attr_name)
+ for k in self.mmlight_attr_list:
+ v = getattr(self, k)
+ if not isinstance(v, torch.Tensor):
+ v = torch.Tensor(v)
+ v = v.view(-1, 3)
+ setattr(self, k, v)
+
+ if getattr(self, k).shape[0] > _N:
+ _N = getattr(self, k).shape[0]
+ for k in self.mmlight_attr_list:
+ if getattr(self, k).shape[0] == 1:
+ setattr(self, k, getattr(self, k).repeat(_N, 1))
+ self._N = _N
+
+ def __len__(self, ):
+ return self._N
+
+ def __getitem__(self, index: Union[int, slice]):
+ if isinstance(index, int):
+ index = [index]
+ kwargs = {}
+ for k in self.mmlight_attr_list:
+ kwargs[k] = getattr(self, k)[index]
+
+ return self.__class__(device=self.device, **kwargs)
+
+ def extend(self, N):
+ kwargs = {}
+ for k in self.mmlight_attr_list:
+ kwargs[k] = getattr(self, k).repeat(N, 1)
+ return self.__class__(device=self.device, **kwargs)
+
+ def extend_(self, N):
+ for k in self.mmlight_attr_list:
+ setattr(self, k, getattr(self, k).repeat(N, 1))
+ self._N = N
+
+
+class AmbientLights(_AmbientLights, MMLights):
+ def __init__(self, ambient_color=None, device='cpu', **kwargs) -> None:
+ if ambient_color is None:
+ ambient_color = ((1.0, 1.0, 1.0), )
+ diffuse_color = ((0.0, 0.0, 0.0), )
+ super(_AmbientLights, self).__init__(ambient_color=ambient_color,
+ diffuse_color=diffuse_color,
+ device=device)
+
+ def __getitem__(self, index: Union[int, slice]):
+ return super(_AmbientLights, self).__getitem__(index)
+
+
+class PointLights(_PointLights, MMLights):
+ def __getitem__(self, index: Union[int, slice]):
+ return super(_PointLights, self).__getitem__(index)
+
+
+class DirectionalLights(_DirectionalLights, MMLights):
+ def __getitem__(self, index: Union[int, slice]):
+ return super(_DirectionalLights, self).__getitem__(index)
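+
+
+# Commented-out sketch of the MMLights broadcasting above: attributes given
+# for a single light are repeated to the common batch size, and
+# __getitem__ / extend slice or repeat along it. The colors are illustrative.
+#
+# lights = PointLights(ambient_color=[[0.5, 0.5, 0.5]],
+#                      diffuse_color=[[0.3, 0.3, 0.3]],
+#                      specular_color=[[0.5, 0.5, 0.5]],
+#                      location=[[0.0, 1.0, 0.0]])
+# lights4 = lights.extend(4)  # len(lights4) == 4
+# first = lights4[0]          # PointLights with batch size 1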
diff --git a/detrsmpl/core/renderer/torch3d_renderer/mesh_renderer.py b/detrsmpl/core/renderer/torch3d_renderer/mesh_renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ff4876d6bf6cc259c666eb93cc3abcaf0abe581
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/mesh_renderer.py
@@ -0,0 +1,81 @@
+from typing import Iterable, Optional, Tuple, Union
+
+import torch
+from pytorch3d.structures import Meshes
+
+from detrsmpl.core.cameras import MMCamerasBase
+from .base_renderer import BaseRenderer
+from .lights import MMLights
+
+
+class MeshRenderer(BaseRenderer):
+ """Render RGBA image with the help of camera system."""
+ shader_type = 'SoftPhongShader'
+
+ def __init__(
+ self,
+ resolution: Tuple[int, int] = None,
+ device: Union[torch.device, str] = 'cpu',
+ output_path: Optional[str] = None,
+ out_img_format: str = '%06d.png',
+ **kwargs,
+ ) -> None:
+ """Renderer for RGBA image of meshes.
+
+ Args:
+ resolution (Iterable[int]):
+ (width, height) of the rendered images resolution.
+ device (Union[torch.device, str], optional):
+ You can pass a str or torch.device for cpu or gpu render.
+ Defaults to 'cpu'.
+ output_path (Optional[str], optional):
+ Output path of the video or images to be saved.
+ Defaults to None.
+ out_img_format (str, optional): The image format string for
+ saving the images.
+ Defaults to '%06d.png'.
+ """
+ super().__init__(resolution=resolution,
+ device=device,
+ output_path=output_path,
+ out_img_format=out_img_format,
+ **kwargs)
+
+ def forward(self,
+ meshes: Meshes,
+ cameras: Optional[MMCamerasBase] = None,
+ lights: Optional[MMLights] = None,
+ indexes: Optional[Iterable[int]] = None,
+ backgrounds: Optional[torch.Tensor] = None,
+ **kwargs) -> Union[torch.Tensor, None]:
+ """Render Meshes.
+
+ Args:
+ meshes (Meshes): meshes to be rendered.
+ cameras (Optional[MMCamerasBase], optional): cameras for render.
+ Defaults to None.
+ lights (Optional[MMLights], optional): lights for render.
+ Defaults to None.
+ indexes (Optional[Iterable[int]], optional): indexes for images.
+ Defaults to None.
+ backgrounds (Optional[torch.Tensor], optional): background images.
+ Defaults to None.
+
+ Returns:
+ Union[torch.Tensor, None]: return tensor or None.
+ """
+
+ meshes = meshes.to(self.device)
+ self._update_resolution(cameras, **kwargs)
+ fragments = self.rasterizer(meshes_world=meshes, cameras=cameras)
+
+ rendered_images = self.shader(
+ fragments=fragments,
+ meshes=meshes,
+ cameras=cameras,
+ lights=self.lights if lights is None else lights)
+
+ if self.output_path is not None:
+ rgba = self.tensor2rgba(rendered_images)
+ self._write_images(rgba, backgrounds, indexes)
+ return rendered_images
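+
+# Commented-out sketch of a direct forward call (`my_*` names are
+# placeholders): the shader shades the rasterized fragments with the given
+# (or default) lights and returns an RGBA image batch.
+#
+# renderer = MeshRenderer(resolution=(512, 512), device='cuda',
+#                         output_path='demo_out/render.mp4')
+# images = renderer(meshes=my_meshes, cameras=my_cameras, lights=my_lights)
+# renderer.export()  # assembles the written frames into the output video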
diff --git a/detrsmpl/core/renderer/torch3d_renderer/meshes.py b/detrsmpl/core/renderer/torch3d_renderer/meshes.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5b8783b78218e1635b6f61660dfa77f4d2d1e71
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/meshes.py
@@ -0,0 +1,526 @@
+from typing import Iterable, List, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from pytorch3d.renderer import TexturesUV, TexturesVertex
+from pytorch3d.renderer.mesh.textures import TexturesBase
+from pytorch3d.structures import Meshes, list_to_padded, padded_to_list
+
+from detrsmpl.models.body_models.builder import SMPL, SMPLX
+from detrsmpl.utils.mesh_utils import \
+ join_meshes_as_batch as _join_meshes_as_batch
+from .builder import build_renderer
+from .textures.textures import TexturesNearest
+from .utils import align_input_to_padded
+
+
+class ParametricMeshes(Meshes):
+ """Mesh structure for parametric body models, E.g., smpl, smplx, mano,
+ flame.
+
+ There are 3 ways to initialize the verts:
+ 1): Pass the verts directly as verts_padded (N, V, 3) or verts_list
+ (list of (N, 3)).
+ 2): Pass body_model and pose_params.
+ 3): Pass meshes. Could be Meshes or ParametricMeshes.
+ Will use the verts from the meshes.
+ There are 3 ways to initialize the faces:
+ 1): Pass the faces directly as faces_padded (N, F, 3) or faces_list
+ (list of (F, 3)).
+ 2): Pass body_model and will use body_model.faces_tensor.
+ 3): Pass meshes. Could be Meshes or ParametricMeshes.
+ Will use the faces from the meshes.
+ There are 4 ways to initialize the textures.
+ 1): Pass the textures directly.
+ 2): Pass the texture_images of shape (H, W, 3) for single person or
+ (_N_individual, H, W, 3) for multi-person. `body_model` should be
+ passed and should have `uv_renderer`.
+ 3): Pass the vertex_color of shape (3) or (V, 3) or (N, V, 3).
+ 4): Pass meshes. Could be Meshes or ParametricMeshes.
+ Will use the textures directly from the meshes.
+ """
+ # TODO: More model class to be added (FLAME, MANO)
+ MODEL_CLASSES = {'smpl': SMPL, 'smplx': SMPLX}
+
+ def __init__(self,
+ verts: Union[List[torch.Tensor], torch.Tensor] = None,
+ faces: Union[List[torch.Tensor], torch.Tensor] = None,
+ textures: TexturesBase = None,
+ meshes: Meshes = None,
+ body_model: Union[nn.Module, dict] = None,
+ uv_renderer: Union[nn.Module, dict] = None,
+ vertex_color: Union[Iterable[float], torch.Tensor,
+ np.ndarray] = ((1, 1, 1), ),
+ use_nearest: bool = False,
+ texture_images: Union[torch.Tensor, List[torch.Tensor],
+ None] = None,
+ model_type: str = 'smpl',
+ N_individual_override: int = None,
+ *,
+ verts_normals: torch.Tensor = None,
+ **pose_params) -> None:
+
+ if isinstance(meshes, Meshes):
+ verts = meshes.verts_padded()
+ faces = meshes.faces_padded()
+ textures = meshes.textures
+
+ self.model_type = body_model._get_name().lower(
+ ) if body_model is not None else model_type
+
+ self.model_class = self.MODEL_CLASSES[self.model_type]
+
+ use_list = False
+
+ # format verts as verts_padded: (N, V, 3)
+ if verts is None:
+ assert body_model is not None
+ verts = body_model(**pose_params)['vertices']
+ elif isinstance(verts, list):
+ verts = list_to_padded(verts)
+ use_list = True
+ # specify number of individuals
+ if N_individual_override is not None:
+ verts = verts.view(
+ -1, self.model_class.NUM_VERTS * N_individual_override, 3)
+
+ # the information of _N_individual should be revealed in verts's shape
+ self._N_individual = int(verts.shape[-2] // self.model_class.NUM_VERTS)
+
+ assert verts.shape[1] % self.model_class.NUM_VERTS == 0
+ verts = verts.view(-1, self.model_class.NUM_VERTS * self._N_individual,
+ 3)
+ device = verts.device
+ N, V, _ = verts.shape
+
+ # format faces as faces_padded: (N, F, 3)
+ if isinstance(faces, list):
+ faces = list_to_padded(faces)
+ self.face_individual = faces[0][:self.model_class.NUM_FACES].to(
+ device)
+ elif faces is None:
+ assert body_model is not None
+ self.face_individual = body_model.faces_tensor[None].to(device)
+ faces = self.get_faces_padded(N, self._N_individual)
+ elif isinstance(faces, torch.Tensor):
+ faces = align_input_to_padded(faces, ndim=3, batch_size=N)
+ self.face_individual = faces[:1, :self.model_class.NUM_FACES].to(
+ device)
+ else:
+ raise ValueError(f'Wrong type of faces: {type(faces)}.')
+
+ assert faces.shape == (N,
+ self.model_class.NUM_FACES * self._N_individual,
+ 3)
+ F = faces.shape[1]
+ if textures is None:
+ if texture_images is None:
+ # input vertex_color should be
+ # (3), (1, 3), (1, 1, 3). all the same color
+ # (V, 3), (1, V, 3), each vertex has a single color
+ # (N, V, 3), each batch each vertex has a single color
+ if isinstance(vertex_color, (tuple, list)):
+ vertex_color = torch.Tensor(vertex_color)
+ elif isinstance(vertex_color, np.ndarray):
+ vertex_color = torch.from_numpy(vertex_color)
+ if vertex_color.numel() == 3:
+ vertex_color = vertex_color.view(1, 3).repeat(V, 1)
+ vertex_color = align_input_to_padded(vertex_color,
+ ndim=3,
+ batch_size=N)
+ assert vertex_color.shape == verts.shape
+ if use_nearest:
+ textures = TexturesNearest(
+ verts_features=vertex_color).to(device)
+ else:
+ textures = TexturesVertex(
+ verts_features=vertex_color).to(device)
+ else:
+
+ texture_images = align_input_to_padded(texture_images,
+ ndim=4,
+ batch_size=N).to(device)
+
+ assert uv_renderer is not None
+ if isinstance(uv_renderer, dict):
+ uv_renderer = build_renderer(uv_renderer)
+ uv_renderer = uv_renderer.to(device)
+ textures = uv_renderer.wrap_texture(texture_images).to(device)
+ if self._N_individual > 1:
+ textures = textures.join_scene()
+ textures = textures.extend(N)
+
+ num_verts_per_mesh = [V for _ in range(N)]
+ num_faces_per_mesh = [F for _ in range(N)]
+
+ if use_list:
+ verts = padded_to_list(verts, num_verts_per_mesh)
+ faces = padded_to_list(faces, num_faces_per_mesh)
+ super().__init__(
+ verts=verts,
+ faces=faces,
+ textures=textures,
+ verts_normals=verts_normals,
+ )
+
+ def get_faces_padded(self, N_batch, N_individual):
+ faces = self.face_individual.repeat(N_batch, N_individual, 1)
+ faces_offset = torch.arange(N_individual).view(N_individual, 1).repeat(
+ 1, self.model_class.NUM_FACES).view(1, -1, 1).to(faces.device)
+ faces = faces + faces_offset * self.model_class.NUM_VERTS
+ return faces
+
+ def _compute_list(self):
+ self._faces_list = self.faces_list()
+ self._verts_list = self.verts_list()
+
+ def extend(self, N_batch: int, N_scene: int = 1):
+ if N_batch == 1:
+ meshes_batch = self
+ else:
+ meshes_batch = join_meshes_as_batch([self for _ in range(N_batch)])
+
+ if N_scene == 1:
+ meshes = meshes_batch
+ else:
+ meshes = join_batch_meshes_as_scene(
+ [meshes_batch for _ in range(N_scene)])
+ return meshes
+
+ def clone(self):
+ """Modified from pytorch3d and add `model_type` in
+ __class__.__init__."""
+ verts_list = self.verts_list()
+ faces_list = self.faces_list()
+ new_verts_list = [v.clone() for v in verts_list]
+ new_faces_list = [f.clone() for f in faces_list]
+ other = self.__class__(verts=new_verts_list,
+ faces=new_faces_list,
+ model_type=self.model_type)
+ for k in self._INTERNAL_TENSORS:
+ v = getattr(self, k)
+ if torch.is_tensor(v):
+ setattr(other, k, v.clone())
+
+ # Textures is not a tensor but has a clone method
+ if self.textures is not None:
+ other.textures = self.textures.clone()
+ return other
+
+ def detach(self):
+ """Modified from pytorch3d and add `model_type` in
+ __class__.__init__."""
+ verts_list = self.verts_list()
+ faces_list = self.faces_list()
+ new_verts_list = [v.detach() for v in verts_list]
+ new_faces_list = [f.detach() for f in faces_list]
+ other = self.__class__(verts=new_verts_list,
+ faces=new_faces_list,
+ model_type=self.model_type)
+
+ for k in self._INTERNAL_TENSORS:
+ v = getattr(self, k)
+ if torch.is_tensor(v):
+ setattr(other, k, v.detach())
+
+ # Textures is not a tensor but has a detach method
+ if self.textures is not None:
+ other.textures = self.textures.detach()
+ return other
+
+ def update_padded(self, new_verts_padded: torch.Tensor):
+ """Modified from pytorch3d and add `model_type` in
+ __class__.__init__."""
+ def check_shapes(x, size):
+ if x.shape[0] != size[0]:
+ raise ValueError('new values must have the same batch size.')
+ if x.shape[1] != size[1]:
+ raise ValueError(
+ 'new values must have the same number of points.')
+ if x.shape[2] != size[2]:
+ raise ValueError('new values must have the same dimension.')
+
+ check_shapes(new_verts_padded, [self._N, self._V, 3])
+
+ new = self.__class__(verts=new_verts_padded,
+ faces=self.faces_padded(),
+ model_type=self.model_type)
+
+ if new._N != self._N or new._V != self._V or new._F != self._F:
+ raise ValueError('Inconsistent sizes after construction.')
+
+ # overwrite the equisized flag
+ new.equisized = self.equisized
+
+ # overwrite textures if any
+ new.textures = self.textures
+
+ # copy auxiliary tensors
+ copy_tensors = ['_num_verts_per_mesh', '_num_faces_per_mesh', 'valid']
+
+ for k in copy_tensors:
+ v = getattr(self, k)
+ if torch.is_tensor(v):
+ setattr(new, k, v) # shallow copy
+
+ # shallow copy of faces_list if any, st new.faces_list()
+ # does not re-compute from _faces_padded
+ new._faces_list = self._faces_list
+
+ # update verts/faces packed if they are computed in self
+ if self._verts_packed is not None:
+ copy_tensors = [
+ '_faces_packed',
+ '_verts_packed_to_mesh_idx',
+ '_faces_packed_to_mesh_idx',
+ '_mesh_to_verts_packed_first_idx',
+ '_mesh_to_faces_packed_first_idx',
+ ]
+ for k in copy_tensors:
+ v = getattr(self, k)
+ assert torch.is_tensor(v)
+ setattr(new, k, v) # shallow copy
+ # update verts_packed
+ pad_to_packed = self.verts_padded_to_packed_idx()
+ new_verts_packed = new_verts_padded.reshape(-1,
+ 3)[pad_to_packed, :]
+ new._verts_packed = new_verts_packed
+ new._verts_padded_to_packed_idx = pad_to_packed
+
+ # update edges packed if they are computed in self
+ if self._edges_packed is not None:
+ copy_tensors = [
+ '_edges_packed',
+ '_edges_packed_to_mesh_idx',
+ '_mesh_to_edges_packed_first_idx',
+ '_faces_packed_to_edges_packed',
+ '_num_edges_per_mesh',
+ ]
+ for k in copy_tensors:
+ v = getattr(self, k)
+ assert torch.is_tensor(v)
+ setattr(new, k, v) # shallow copy
+
+ # update laplacian if it is compute in self
+ if self._laplacian_packed is not None:
+ new._laplacian_packed = self._laplacian_packed
+
+ assert new._verts_list is None
+ assert new._verts_normals_packed is None
+ assert new._faces_normals_packed is None
+ assert new._faces_areas_packed is None
+
+ return new
+
+ def __getitem__(self, index: Union[tuple, int, list, slice, torch.Tensor]):
+ """Slice the meshes by the batch dim like pytorch3d Meshes. And slice
+ by scene dim due to the topology of the parametric meshes.
+
+ Args:
+ index (Union[tuple, int, list, slice, torch.Tensor]): indexes; if
+ only a single index is passed, the scene dim is ignored.
+ """
+ if isinstance(index, tuple):
+ batch_index, individual_index = index
+ else:
+ batch_index, individual_index = index, None
+
+ if isinstance(batch_index, int):
+ batch_index = [batch_index]
+ elif isinstance(batch_index, (tuple, list, slice)):
+ batch_index = torch.arange(self._N)[batch_index]
+ batch_index = torch.tensor(batch_index) if not isinstance(
+ batch_index, torch.Tensor) else batch_index
+ batch_index = batch_index.to(self.device, dtype=torch.long)
+
+ if (batch_index >= self._N).any():
+ raise IndexError('list index out of range')
+
+ if individual_index is None:
+ return self.__class__(verts=self.verts_padded()[batch_index],
+ faces=self.faces_padded()[batch_index],
+ textures=self.textures[batch_index]
+ if self.textures is not None else None,
+ model_type=self.model_type)
+
+ if isinstance(individual_index, int):
+ individual_index = [individual_index]
+ elif isinstance(individual_index, (tuple, list, slice)):
+ individual_index = torch.arange(
+ self._N_individual)[individual_index]
+ individual_index = torch.tensor(individual_index) if not isinstance(
+ individual_index, torch.Tensor) else individual_index
+ if (individual_index >= self._N_individual).any():
+ raise IndexError('list index out of range')
+ vertex_index = [
+ torch.arange(self.model_class.NUM_VERTS) +
+ idx * self.model_class.NUM_VERTS for idx in individual_index
+ ]
+ vertex_index = torch.cat(vertex_index).to(self.device).long()
+
+ new_face_num = self.model_class.NUM_FACES * len(individual_index)
+
+ verts_padded = self.verts_padded()[batch_index][:, vertex_index]
+ faces_padded = self.get_faces_padded(len(verts_padded),
+ len(individual_index))
+
+ textures_batch = self.textures[batch_index]
+
+ if isinstance(textures_batch, TexturesUV):
+ # TODO: there is still some problem with `TexturesUV`
+ # slice and need to fix the function `join_meshes_as_scene`.
+ # It is recommended that we re-inplement the `TexturesUV`
+ # as `ParametricTexturesUV`, mainly for the `__getitem__`
+ # and `join_scene` functions.
+
+ # textures_batch.get('unique_map_index ')
+
+ # This version only consider the maps tensor as different id.
+ maps = textures_batch.maps_padded()
+ width_individual = maps.shape[-2] // self._N_individual
+ maps_index = [
+ torch.arange(width_individual * idx,
+ width_individual * (idx + 1))
+ for idx in individual_index
+ ]
+ maps_index = torch.cat(maps_index).to(self.device)
+ verts_uvs_padded = textures_batch.verts_uvs_padded(
+ )[:, :len(vertex_index)] * torch.Tensor([
+ self._N_individual / len(individual_index), 1
+ ]).view(1, 1, 2).to(self.device)
+ faces_uvs_padded = textures_batch.faces_uvs_padded(
+ )[:, :new_face_num]
+ maps_padded = maps[:, :, maps_index]
+ textures = TexturesUV(faces_uvs=faces_uvs_padded,
+ verts_uvs=verts_uvs_padded,
+ maps=maps_padded)
+ elif isinstance(textures_batch, (TexturesVertex, TexturesNearest)):
+ verts_features_padded = textures_batch.verts_features_padded(
+ )[:, vertex_index]
+ textures = textures_batch.__class__(verts_features_padded)
+ meshes = self.__class__(verts=verts_padded,
+ faces=faces_padded,
+ textures=textures,
+ model_type=self.model_type)
+ return meshes
+
+ @property
+ def shape(self, ):
+ return (len(self), self._N_individual)
+
+
+def join_meshes_as_batch(meshes: List[ParametricMeshes],
+ include_textures: bool = True) -> ParametricMeshes:
+ """Join the meshes along the batch dim.
+
+ Args:
+ meshes (Union[ParametricMeshes, List[ParametricMeshes, Meshes,
+ List[Meshes]]]): Meshes object that contains a batch of meshes,
+ or a list of Meshes objects.
+ include_textures (bool, optional): whether to try to join the textures.
+ Defaults to True.
+
+ Returns:
+ ParametricMeshes: the joined ParametricMeshes.
+ """
+ if isinstance(meshes, ParametricMeshes):
+ raise ValueError('Wrong first argument to join_meshes_as_batch.')
+ first = meshes[0]
+
+ assert all(mesh.model_type == first.model_type
+ for mesh in meshes), 'model_type should all be the same.'
+
+ meshes = _join_meshes_as_batch(meshes, include_textures=include_textures)
+ return ParametricMeshes(model_type=first.model_type, meshes=meshes)
+
+
+def join_meshes_as_scene(meshes: Union[ParametricMeshes,
+ List[ParametricMeshes]],
+ include_textures: bool = True) -> ParametricMeshes:
+ """Join the meshes along the scene dim.
+
+ Args:
+ meshes (Union[ParametricMeshes, List[ParametricMeshes]]):
+ ParametricMeshes object that contains a batch of meshes,
+ or a list of ParametricMeshes objects.
+ include_textures (bool, optional): whether to try to join the textures.
+ Defaults to True.
+
+ Returns:
+ ParametricMeshes: the joined ParametricMeshes.
+ """
+ first = meshes[0]
+ assert all(mesh.model_type == first.model_type
+ for mesh in meshes), 'model_type should all be the same.'
+
+ if isinstance(meshes, List):
+ meshes = join_meshes_as_batch(meshes,
+ include_textures=include_textures)
+
+ if len(meshes) == 1:
+ return meshes
+ verts = meshes.verts_packed() # (sum(V_n), 3)
+ # Offset automatically done by faces_packed
+ faces = meshes.faces_packed() # (sum(F_n), 3)
+ textures = None
+
+ if include_textures and meshes.textures is not None:
+ textures = meshes.textures.join_scene()
+
+ mesh = ParametricMeshes(verts=verts.unsqueeze(0),
+ faces=faces.unsqueeze(0),
+ textures=textures,
+ model_type=first.model_type)
+
+ return mesh
+
+
+def join_batch_meshes_as_scene(
+ meshes: List[ParametricMeshes],
+ include_textures: bool = True) -> ParametricMeshes:
+ """Join `meshes` as a scene each batch. For ParametricMeshes. The Meshes
+ must share the same batch size, and topology could be different. They must
+ all be on the same device. If `include_textures` is true, the textures
+ should be the same type, all be None is not accepted. If `include_textures`
+ is False, textures are ignored. The return meshes will have no textures.
+
+ Args:
+ meshes (List[ParametricMeshes]): Meshes object that contains a list of
+ Meshes objects.
+ include_textures (bool, optional): whether to try to join the textures.
+ Defaults to True.
+
+
+ Returns:
+ New Meshes in which the different Meshes are joined as one scene per
+ batch.
+ """
+ first = meshes[0]
+
+ assert all(mesh.model_type == first.model_type
+ for mesh in meshes), 'model_type should all be the same.'
+
+ assert all(len(mesh) == len(first) for mesh in meshes)
+ if not all(mesh.shape[1] == first.shape[1] for mesh in meshes):
+ meshes_temp = []
+ for mesh_scene in meshes:
+ meshes_temp.extend([
+ mesh_scene[:, individual_index]
+ for individual_index in range(mesh_scene._N_individual)
+ ])
+ meshes = meshes_temp
+ for mesh in meshes:
+ mesh._verts_list = padded_to_list(mesh.verts_padded(),
+ mesh.num_verts_per_mesh().tolist())
+ num_scene_size = len(meshes)
+ num_batch_size = len(meshes[0])
+
+ meshes_all = []
+ for j in range(num_batch_size):
+ meshes_batch = []
+ for i in range(num_scene_size):
+ meshes_batch.append(meshes[i][j])
+ meshes_all.append(join_meshes_as_scene(meshes_batch, include_textures))
+ meshes_final = join_meshes_as_batch(meshes_all, include_textures)
+
+ return meshes_final
diff --git a/detrsmpl/core/renderer/torch3d_renderer/normal_renderer.py b/detrsmpl/core/renderer/torch3d_renderer/normal_renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e9364830c251302261b3a7f1b5578e86e706495
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/normal_renderer.py
@@ -0,0 +1,89 @@
+from typing import Iterable, Optional, Union
+
+import torch
+from pytorch3d.structures import Meshes
+
+from detrsmpl.core.cameras import MMCamerasBase
+from .base_renderer import BaseRenderer
+from .utils import normalize
+
+
+class NormalRenderer(BaseRenderer):
+ """Render normal map with the help of camera system."""
+ shader_type = 'NormalShader'
+
+ def __init__(
+ self,
+ resolution: Iterable[int] = None,
+ device: Union[torch.device, str] = 'cpu',
+ output_path: Optional[str] = None,
+ out_img_format: str = '%06d.png',
+ **kwargs,
+ ) -> None:
+ """Renderer for normal map of meshes.
+
+ Args:
+ resolution (Iterable[int]):
+ (width, height) of the rendered images resolution.
+ device (Union[torch.device, str], optional):
+ You can pass a str or torch.device for cpu or gpu render.
+ Defaults to 'cpu'.
+ output_path (Optional[str], optional):
+ Output path of the video or images to be saved.
+ Defaults to None.
+ out_img_format (str, optional): The image format string for
+ saving the images.
+ Defaults to '%06d.png'.
+
+ Returns:
+ None
+ """
+ super().__init__(resolution=resolution,
+ device=device,
+ output_path=output_path,
+ obj_path=None,
+ out_img_format=out_img_format,
+ **kwargs)
+
+ def forward(self,
+ meshes: Optional[Meshes] = None,
+ cameras: Optional[MMCamerasBase] = None,
+ indexes: Optional[Iterable[int]] = None,
+ backgrounds: Optional[torch.Tensor] = None,
+ **kwargs):
+ """Render Meshes.
+
+ Args:
+ meshes (Optional[Meshes], optional): meshes to be rendered.
+ Defaults to None.
+ cameras (Optional[MMCamerasBase], optional): cameras for render.
+ Defaults to None.
+ indexes (Optional[Iterable[int]], optional): indexes for the
+ images.
+ Defaults to None.
+ backgrounds (Optional[torch.Tensor], optional): background images.
+ Defaults to None.
+
+ Returns:
+ Union[torch.Tensor, None]: return tensor or None.
+ """
+
+ meshes = meshes.to(self.device)
+ self._update_resolution(cameras, **kwargs)
+ fragments = self.rasterizer(meshes_world=meshes, cameras=cameras)
+ normal_map = self.shader(fragments=fragments,
+ meshes=meshes,
+ cameras=cameras)
+
+ if self.output_path is not None:
+ rgba = self.tensor2rgba(normal_map)
+ self._write_images(rgba, backgrounds, indexes)
+
+ return normal_map
+
+ def tensor2rgba(self, tensor: torch.Tensor):
+ rgbs, valid_masks = tensor[..., :3], (tensor[..., 3:] > 0) * 1.0
+ rgbs = normalize(rgbs,
+ origin_value_range=(-1, 1),
+ out_value_range=(0, 1))
+ return torch.cat([rgbs, valid_masks], -1)
diff --git a/detrsmpl/core/renderer/torch3d_renderer/pointcloud_renderer.py b/detrsmpl/core/renderer/torch3d_renderer/pointcloud_renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..045e69ba5346d94811c39ba0b1440b9e642d2ac0
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/pointcloud_renderer.py
@@ -0,0 +1,161 @@
+import warnings
+from typing import Iterable, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+from pytorch3d.renderer import (
+ AlphaCompositor,
+ PointsRasterizationSettings,
+ PointsRasterizer,
+)
+from pytorch3d.structures import Meshes, Pointclouds
+
+from detrsmpl.core.cameras import MMCamerasBase
+from detrsmpl.utils.mesh_utils import mesh_to_pointcloud_vc
+from .base_renderer import BaseRenderer
+
+
+class PointCloudRenderer(BaseRenderer):
+ def __init__(self,
+ resolution: Tuple[int, int] = None,
+ device: Union[torch.device, str] = 'cpu',
+ output_path: Optional[str] = None,
+ out_img_format: str = '%06d.png',
+ radius: Optional[float] = None,
+ **kwargs) -> None:
+ """Point cloud renderer.
+
+ Args:
+ resolution (Iterable[int]):
+ (width, height) of the rendered images resolution.
+ device (Union[torch.device, str], optional):
+ You can pass a str or torch.device for cpu or gpu render.
+ Defaults to 'cpu'.
+ output_path (Optional[str], optional):
+ Output path of the video or images to be saved.
+ Defaults to None.
+ out_img_format (str, optional): name format for temp images.
+ Defaults to '%06d.png'.
+ radius (float, optional): radius of points. Defaults to None.
+
+ Returns:
+ None
+ """
+ self.radius = radius
+ super().__init__(resolution=resolution,
+ device=device,
+ output_path=output_path,
+ out_img_format=out_img_format,
+ **kwargs)
+
+ def to(self, device):
+ if isinstance(device, str):
+ device = torch.device(device)
+ self.device = device
+ if getattr(self.rasterizer, 'cameras', None) is not None:
+ self.rasterizer.cameras = self.rasterizer.cameras.to(device)
+
+ self.compositor = self.compositor.to(device)
+ return self
+
+ def _init_renderer(self, rasterizer=None, compositor=None, **kwargs):
+ """Set render params."""
+
+ if isinstance(rasterizer, nn.Module):
+ rasterizer.raster_settings.image_size = self.resolution
+ self.rasterizer = rasterizer
+ elif isinstance(rasterizer, dict):
+ rasterizer['image_size'] = self.resolution
+ if self.radius is not None:
+ rasterizer.update(radius=self.radius)
+ raster_settings = PointsRasterizationSettings(**rasterizer)
+ self.rasterizer = PointsRasterizer(raster_settings=raster_settings)
+ elif rasterizer is None:
+ self.rasterizer = PointsRasterizer(
+ raster_settings=PointsRasterizationSettings(
+ radius=self.radius,
+ image_size=self.resolution,
+ points_per_pixel=10))
+ else:
+ raise TypeError(
+ f'Wrong type of rasterizer: {type(self.rasterizer)}.')
+
+ if isinstance(compositor, dict):
+ self.compositor = AlphaCompositor(**compositor)
+ elif isinstance(compositor, nn.Module):
+ self.compositor = compositor
+ elif compositor is None:
+ self.compositor = AlphaCompositor()
+ else:
+ raise TypeError(
+ f'Wrong type of compositor: {type(self.compositor)}.')
+ self = self.to(self.device)
+
+ def forward(
+ self,
+ pointclouds: Optional[Pointclouds] = None,
+ vertices: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
+ verts_rgba: Optional[Union[torch.Tensor, List[torch.Tensor]]] = None,
+ meshes: Meshes = None,
+ cameras: Optional[MMCamerasBase] = None,
+ indexes: Optional[Iterable[int]] = None,
+ backgrounds: Optional[torch.Tensor] = None,
+ **kwargs,
+ ) -> Union[None, torch.Tensor]:
+ """Render pointclouds.
+
+ Args:
+ pointclouds (Optional[Pointclouds], optional): pytorch3d data
+ structure. If not None, `vertices` and `verts_rgba` will
+ be ignored.
+ Defaults to None.
+ vertices (Optional[Union[torch.Tensor, List[torch.Tensor]]],
+ optional): coordinate tensor of points. Defaults to None.
+ verts_rgba (Optional[Union[torch.Tensor, List[torch.Tensor]]],
+ optional): color tensor of points. Defaults to None.
+ indexes (Optional[Iterable[int]], optional): indexes for the
+ images.
+ Defaults to None.
+ backgrounds (Optional[torch.Tensor], optional): background images.
+ Defaults to None.
+
+ Returns:
+ Union[None, torch.Tensor]: Return tensor or None.
+ """
+ if pointclouds is None:
+ if meshes is not None:
+ pointclouds = mesh_to_pointcloud_vc(meshes)
+ else:
+ assert vertices is not None
+ if isinstance(vertices, torch.Tensor):
+ if vertices.ndim == 2:
+ vertices = vertices[None]
+ if isinstance(verts_rgba, torch.Tensor):
+ if verts_rgba.ndim == 2:
+ verts_rgba = verts_rgba[None]
+ pointclouds = Pointclouds(points=vertices, features=verts_rgba)
+ else:
+ if vertices is not None or verts_rgba is not None:
+ warnings.warn(
+ 'Redundant input, will ignore `vertices` and `verts_rgba`.')
+ pointclouds = pointclouds.to(self.device)
+ self._update_resolution(cameras, **kwargs)
+ fragments = self.rasterizer(pointclouds, cameras=cameras)
+ r = self.rasterizer.raster_settings.radius
+
+ dists2 = fragments.dists.permute(0, 3, 1, 2)
+ weights = 1 - dists2 / (r * r)
+ rendered_images = self.compositor(
+ fragments.idx.long().permute(0, 3, 1, 2),
+ weights,
+ pointclouds.features_packed().permute(1, 0),
+ **kwargs,
+ )
+ rendered_images = rendered_images.permute(0, 2, 3, 1)
+
+ if self.output_path is not None:
+ rgba = self.tensor2rgba(rendered_images)
+ self._write_images(rgba, backgrounds, indexes)
+
+ return rendered_images
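+
+# Commented-out sketch of the two input routes handled by forward() above
+# (`my_cameras` is a placeholder): pass a ready-made Pointclouds object, or
+# raw per-point coordinates plus RGBA features.
+#
+# renderer = PointCloudRenderer(resolution=(512, 512), device='cuda', radius=0.003)
+# points = torch.rand(1, 10000, 3, device='cuda')
+# colors = torch.ones(1, 10000, 4, device='cuda')
+# images = renderer(vertices=points, verts_rgba=colors, cameras=my_cameras)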
diff --git a/detrsmpl/core/renderer/torch3d_renderer/render_runner.py b/detrsmpl/core/renderer/torch3d_renderer/render_runner.py
new file mode 100644
index 0000000000000000000000000000000000000000..a9c313011b206438a1b075d920cc22f90d71b32c
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/render_runner.py
@@ -0,0 +1,125 @@
+import math
+import os
+from typing import Iterable, Optional, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from pytorch3d.renderer import MeshRenderer, SoftSilhouetteShader
+from pytorch3d.renderer.cameras import CamerasBase
+from pytorch3d.structures import Meshes
+from tqdm import trange
+
+from detrsmpl.core.cameras import MMCamerasBase
+from detrsmpl.core.cameras.builder import build_cameras
+from .base_renderer import BaseRenderer
+from .builder import build_renderer
+from .lights import AmbientLights, MMLights, build_lights
+
+osj = os.path.join
+
+
+def render(renderer: Union[nn.Module, dict],
+ meshes: Union[Meshes, None] = None,
+ output_path: Optional[str] = None,
+ resolution: Union[Iterable[int], int] = None,
+ device: Union[str, torch.device] = 'cpu',
+ cameras: Union[MMCamerasBase, CamerasBase, dict, None] = None,
+ lights: Union[MMLights, dict, None] = None,
+ batch_size: int = 5,
+ return_tensor: bool = False,
+ no_grad: bool = False,
+ verbose: bool = True,
+ **forward_params):
+
+ if isinstance(renderer, dict):
+ renderer = build_renderer(renderer)
+ elif isinstance(renderer, MeshRenderer):
+ if isinstance(renderer.shader, SoftSilhouetteShader):
+ renderer = build_renderer(
+ dict(type='silhouette',
+ resolution=resolution,
+ shader=renderer.shader,
+ rasterizer=renderer.rasterizer))
+ else:
+ renderer = build_renderer(
+ dict(type='mesh',
+ resolution=resolution,
+ shader=renderer.shader,
+ rasterizer=renderer.rasterizer))
+ elif isinstance(renderer, BaseRenderer):
+ renderer = renderer
+ else:
+ raise TypeError('Wrong input renderer type.')
+
+ renderer = renderer.to(device)
+ if output_path is not None:
+ renderer._set_output_path(output_path)
+
+ if isinstance(cameras, dict):
+ cameras = build_cameras(cameras)
+ elif isinstance(cameras, MMCamerasBase):
+ cameras = cameras
+ elif isinstance(cameras,
+ CamerasBase) and not isinstance(cameras, MMCamerasBase):
+ cameras = build_cameras(
+ dict(type=cameras.__class__.__name__,
+ K=cameras.K,
+ R=cameras.R,
+ T=cameras.T,
+ in_ndc=cameras.in_ndc(),
+ resolution=resolution))
+ else:
+ raise TypeError('Wrong input cameras type.')
+ num_frames = len(meshes)
+ if isinstance(lights, dict):
+ lights = build_lights(lights)
+ elif isinstance(lights, MMLights):
+ lights = lights
+ elif lights is None:
+ lights = AmbientLights(device=device).extend(num_frames)
+ else:
+ raise ValueError('Wrong light type.')
+
+ if len(cameras) == 1:
+ cameras = cameras.extend(num_frames)
+ if len(lights) == 1:
+ lights = lights.extend(num_frames)
+
+ forward_params.update(lights=lights, cameras=cameras, meshes=meshes)
+
+ batch_size = min(batch_size, num_frames)
+ tensors = []
+ for k in forward_params:
+ if isinstance(forward_params[k], np.ndarray):
+ forward_params.update(
+ {k: torch.tensor(forward_params[k]).to(device)})
+ if verbose:
+ iter_func = trange
+ else:
+ iter_func = range
+    for i in iter_func(math.ceil(num_frames / batch_size)):
+        indexes = list(
+            range(i * batch_size, min((i + 1) * batch_size, len(meshes))))
+        forward_params_batch = {}
+
+        for k in forward_params:
+            if hasattr(forward_params[k], '__getitem__'):
+                forward_params_batch[k] = forward_params[k][indexes].to(
+                    device)
+
+        if no_grad:
+            with torch.no_grad():
+                images_batch = renderer(indexes=indexes,
+                                        **forward_params_batch)
+        else:
+            images_batch = renderer(indexes=indexes, **forward_params_batch)
+        tensors.append(images_batch)
+
+ renderer.export()
+
+    if return_tensor:
+        return torch.cat(tensors)
+    else:
+        # Detach and move to CPU so the concatenation also works for CUDA
+        # tensors.
+        return np.concatenate(
+            [batch.detach().cpu().numpy() for batch in tensors])
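+
+
+# Usage sketch (illustrative only, not part of the original module): drive a
+# mesh renderer over a batch of textured meshes. `my_textured_meshes` and
+# `my_camera_cfg` are placeholder names, and it is assumed that the renderer
+# and camera builders accept the preset dict and camera config shown here.
+#
+#   from detrsmpl.core.renderer.torch3d_renderer.render_smpl_config import \
+#       RENDER_CONFIGS
+#   images = render(renderer=RENDER_CONFIGS['hq'],
+#                   meshes=my_textured_meshes,
+#                   cameras=my_camera_cfg,
+#                   resolution=(512, 512),
+#                   device='cuda',
+#                   batch_size=4,
+#                   no_grad=True,
+#                   return_tensor=True)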
diff --git a/detrsmpl/core/renderer/torch3d_renderer/render_smpl_config.py b/detrsmpl/core/renderer/torch3d_renderer/render_smpl_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..4afc80b78b7ccd679dd08d71b969b261de242e7d
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/render_smpl_config.py
@@ -0,0 +1,149 @@
+base_directional_light = {
+ 'type': 'directional',
+ 'direction': [[1, 1, 1]],
+ 'ambient_color': [[0.5, 0.5, 0.5]],
+ 'diffuse_color': [[0.5, 0.5, 0.5]],
+ 'specular_color': [[0.5, 0.5, 0.5]],
+}
+
+base_point_light = {
+ 'type': 'point',
+ 'ambient_color': [[1, 1, 1]],
+ 'diffuse_color': [[0.3, 0.3, 0.3]],
+ 'specular_color': [[0.5, 0.5, 0.5]],
+ 'location': [[2.0, 2.0, -2.0]],
+}
+
+base_ambient_light = {
+ 'type': 'ambient',
+ 'ambient_color': [[1.0, 1.0, 1.0]],
+}
+
+base_material = {
+ 'ambient_color': [[1, 1, 1]],
+ 'diffuse_color': [[0.5, 0.5, 0.5]],
+ 'specular_color': [[0.15, 0.15, 0.15]],
+ 'shininess': 60.0,
+}
+
+silhouette_material = {
+ 'ambient_color': [[1.0, 1.0, 1.0]],
+ 'diffuse_color': [[0.0, 0.0, 0.0]],
+ 'specular_color': [[0.0, 0.0, 0.0]],
+ 'shininess': 1.0,
+}
+
+white_blend_params = {'background_color': (1.0, 1.0, 1.0)}
+
+black_blend_params = {'background_color': (0.0, 0.0, 0.0)}
+
+RENDER_CONFIGS = {
+ # low quality
+ 'lq': {
+ 'type': 'mesh',
+ 'shader': {
+ 'type': 'hard_flat'
+ },
+ 'lights': base_directional_light,
+ 'materials': base_material,
+ 'rasterizer': {
+ 'bin_size': 0,
+ 'blur_radius': 0.0,
+ 'faces_per_pixel': 1,
+ 'perspective_correct': False,
+ },
+ 'blend_params': white_blend_params,
+ },
+ # medium quality
+ 'mq': {
+ 'type': 'mesh',
+ 'shader': {
+ 'type': 'soft_gouraud'
+ },
+ 'lights': base_directional_light,
+ 'materials': base_material,
+ 'rasterizer': {
+ 'bin_size': 0,
+ 'blur_radius': 0.0,
+ 'faces_per_pixel': 1,
+ 'perspective_correct': False,
+ },
+ 'blend_params': white_blend_params,
+ },
+ # high quality
+ 'hq': {
+ 'type': 'mesh',
+ 'shader': {
+ 'type': 'soft_phong'
+ },
+ 'lights': base_directional_light,
+ 'materials': base_material,
+ 'rasterizer': {
+ 'bin_size': 0,
+ 'blur_radius': 0.0,
+ 'faces_per_pixel': 1,
+ 'perspective_correct': False,
+ },
+ 'blend_params': white_blend_params,
+ },
+ 'silhouette': {
+ 'type': 'silhouette',
+ 'lights': None,
+        'materials': silhouette_material,
+ 'rasterizer': {
+ 'bin_size': 0,
+ 'blur_radius': 2e-5,
+ 'faces_per_pixel': 50,
+ 'perspective_correct': False,
+ },
+ 'blend_params': black_blend_params,
+ },
+ 'part_silhouette': {
+ 'type': 'segmentation',
+ 'material': base_material,
+ 'rasterizer': {
+ 'bin_size': 0,
+ 'blur_radius': 0.0,
+ 'faces_per_pixel': 1,
+ 'perspective_correct': False,
+ },
+ 'blend_params': black_blend_params,
+ },
+ 'depth': {
+ 'type': 'depth',
+ 'rasterizer': {
+ 'bin_size': 0,
+ 'blur_radius': 0.0,
+ 'faces_per_pixel': 1,
+ 'perspective_correct': False,
+ },
+ 'blend_params': black_blend_params,
+ },
+ 'normal': {
+ 'type': 'normal',
+ 'rasterizer': {
+ 'bin_size': 0,
+ 'blur_radius': 0.0,
+ 'faces_per_pixel': 1,
+ 'perspective_correct': False,
+ },
+ 'blend_params': white_blend_params,
+ },
+ 'pointcloud': {
+ 'type': 'pointcloud',
+ 'compositor': {
+ 'background_color': [
+ 1.0,
+ 1.0,
+ 1.0,
+ 0.0,
+ ],
+ },
+ 'rasterizer': {
+ 'points_per_pixel': 10,
+ 'radius': 0.003,
+ 'bin_size': None,
+ 'max_points_per_bin': None,
+ }
+ }
+}
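+
+
+if __name__ == '__main__':
+    # Minimal sketch (not part of the original config): the presets above are
+    # plain dicts, so callers typically deep-copy one and override fields
+    # before handing it to the renderer builder.
+    import copy
+
+    config = copy.deepcopy(RENDER_CONFIGS['hq'])
+    config['rasterizer']['faces_per_pixel'] = 2
+    print(sorted(config.keys()))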
diff --git a/detrsmpl/core/renderer/torch3d_renderer/segmentation_renderer.py b/detrsmpl/core/renderer/torch3d_renderer/segmentation_renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..c403e4073af365e1709879fb1ad0ff48124d4df9
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/segmentation_renderer.py
@@ -0,0 +1,106 @@
+from typing import Iterable, Optional, Tuple, Union
+
+import torch
+from pytorch3d.structures import Meshes
+
+from detrsmpl.core.cameras import MMCamerasBase
+from detrsmpl.utils.demo_utils import get_different_colors
+from .base_renderer import BaseRenderer
+from .utils import normalize
+
+
+class SegmentationRenderer(BaseRenderer):
+ """Render segmentation map into a segmentation index tensor."""
+ shader_type = 'SegmentationShader'
+
+ def __init__(self,
+ resolution: Tuple[int, int] = None,
+ device: Union[torch.device, str] = 'cpu',
+ output_path: Optional[str] = None,
+ out_img_format: str = '%06d.png',
+ num_class: int = 1,
+ **kwargs) -> None:
+ """Render vertex-color mesh into a segmentation map of a (B, H, W)
+ tensor. For visualization, the output rgba image will be (B, H, W, 4),
+ and the color palette comes from `get_different_colors`. The
+ segmentation map is a tensor each pixel saves the classification index.
+ Please make sure you have allocate each pixel a correct classification
+ index by defining a textures of vertex color.
+
+ [CrossEntropyLoss](https://pytorch.org/docs/stable/generated/torch.nn.
+ CrossEntropyLoss.html)
+
+ Args:
+ resolution (Iterable[int]):
+ (width, height) of the rendered images resolution.
+ device (Union[torch.device, str], optional):
+ You can pass a str or torch.device for cpu or gpu render.
+ Defaults to 'cpu'.
+ output_path (Optional[str], optional):
+ Output path of the video or images to be saved.
+ Defaults to None.
+ out_img_format (str, optional): The image format string for
+ saving the images.
+ Defaults to '%06d.png'.
+ num_class (int, optional): number of segmentation parts.
+ Defaults to 1.
+
+ Returns:
+ None
+ """
+ super().__init__(resolution=resolution,
+ device=device,
+ output_path=output_path,
+ obj_path=None,
+ out_img_format=out_img_format,
+ **kwargs)
+ self.num_class = num_class
+
+ def forward(self,
+ meshes: Meshes,
+ cameras: Optional[MMCamerasBase] = None,
+ indexes: Optional[Iterable[int]] = None,
+ backgrounds: Optional[torch.Tensor] = None,
+ **kwargs):
+ """Render segmentation map.
+
+ Args:
+ meshes (Meshes): meshes to be rendered.
+                Requires the textures type to be `TexturesNearest`.
+ The color indicates the class index of the triangle.
+ cameras (Optional[MMCamerasBase], optional): cameras for render.
+ Defaults to None.
+ indexes (Optional[Iterable[int]], optional): indexes for images.
+ Defaults to None.
+ backgrounds (Optional[torch.Tensor], optional): background images.
+ Defaults to None.
+
+ Returns:
+ Union[torch.Tensor, None]: return tensor or None.
+ """
+
+ meshes = meshes.to(self.device)
+ self._update_resolution(cameras, **kwargs)
+ fragments = self.rasterizer(meshes_world=meshes, cameras=cameras)
+ segmentation_map = self.shader(fragments=fragments,
+ meshes=meshes,
+ cameras=cameras)
+
+        if self.output_path is not None:
+            rgba = self.tensor2rgba(segmentation_map)
+            self._write_images(rgba, backgrounds, indexes)
+
+ return segmentation_map
+
+ def tensor2rgba(self, tensor: torch.Tensor):
+ valid_masks = (tensor[..., :] > 0) * 1.0
+ color = torch.Tensor(get_different_colors(self.num_class))
+ color = torch.cat([torch.zeros(1, 3), color]).to(self.device)
+ B, H, W, _ = tensor.shape
+ rgbs = color[tensor.view(-1)].view(B, H, W, 3) * valid_masks
+ rgbs = normalize(rgbs.float(),
+ origin_value_range=(0, 255),
+ out_value_range=(0, 1))
+ rgba = torch.cat([rgbs, valid_masks], -1)
+ return rgba
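+
+
+# Usage sketch (illustrative only, not part of the original module): this
+# renderer corresponds to the 'part_silhouette' preset (type 'segmentation')
+# in render_smpl_config.py. `part_colored_meshes`, `my_cameras` and the class
+# count below are placeholders; the vertex-color textures must encode the
+# per-part class indices.
+#
+#   renderer = SegmentationRenderer(resolution=(512, 512),
+#                                   device='cuda',
+#                                   num_class=24)
+#   seg_map = renderer(meshes=part_colored_meshes, cameras=my_cameras)
+#   rgba = renderer.tensor2rgba(seg_map)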
diff --git a/detrsmpl/core/renderer/torch3d_renderer/shader/__init__.py b/detrsmpl/core/renderer/torch3d_renderer/shader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e55fad81f1eb3de0b5e39d7107ffc579b314446
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/shader/__init__.py
@@ -0,0 +1,16 @@
+# yapf: disable
+from .builder import ( # noqa: F401
+ DepthShader,
+ HardFlatShader,
+ HardGouraudShader,
+ HardPhongShader,
+ NoLightShader,
+ NormalShader,
+ SegmentationShader,
+ SilhouetteShader,
+ SoftGouraudShader,
+ SoftPhongShader,
+ build_shader,
+)
+
+# yapf: enable
diff --git a/detrsmpl/core/renderer/torch3d_renderer/shader/builder.py b/detrsmpl/core/renderer/torch3d_renderer/shader/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..900d4440dbdcae86ac1babae876e442b4de1f373
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/shader/builder.py
@@ -0,0 +1,51 @@
+from mmcv.utils import Registry
+from pytorch3d.renderer import (
+ HardFlatShader,
+ HardGouraudShader,
+ HardPhongShader,
+ SoftGouraudShader,
+ SoftPhongShader,
+)
+
+from .shader import (
+ DepthShader,
+ NoLightShader,
+ NormalShader,
+ SegmentationShader,
+ SilhouetteShader,
+)
+
+SHADER = Registry('shader')
+SHADER.register_module(name=[
+ 'flat', 'hard_flat_shader', 'hard_flat', 'HardFlat', 'HardFlatShader'
+],
+ module=HardFlatShader)
+SHADER.register_module(name=['hard_phong', 'HardPhong', 'HardPhongShader'],
+ module=HardPhongShader)
+SHADER.register_module(
+ name=['hard_gouraud', 'HardGouraud', 'HardGouraudShader'],
+ module=HardGouraudShader)
+SHADER.register_module(
+ name=['soft_gouraud', 'SoftGouraud', 'SoftGouraudShader'],
+ module=SoftGouraudShader)
+SHADER.register_module(name=['soft_phong', 'SoftPhong', 'SoftPhongShader'],
+ module=SoftPhongShader)
+SHADER.register_module(name=['silhouette', 'Silhouette', 'SilhouetteShader'],
+ module=SilhouetteShader)
+SHADER.register_module(
+ name=['nolight', 'nolight_shader', 'NoLight', 'NoLightShader'],
+ module=NoLightShader)
+SHADER.register_module(
+ name=['normal', 'normal_shader', 'Normal', 'NormalShader'],
+ module=NormalShader)
+SHADER.register_module(name=['depth', 'depth_shader', 'Depth', 'DepthShader'],
+ module=DepthShader)
+SHADER.register_module(name=[
+ 'segmentation', 'segmentation_shader', 'Segmentation', 'SegmentationShader'
+],
+ module=SegmentationShader)
+
+
+def build_shader(cfg):
+ """Build shader."""
+ return SHADER.build(cfg)
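+
+
+if __name__ == '__main__':
+    # Minimal sketch (not part of the original module): build a shader from a
+    # registry config dict, mirroring how the mesh renderers construct their
+    # shaders internally.
+    shader = build_shader(dict(type='soft_phong'))
+    print(type(shader).__name__)  # SoftPhongShader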
diff --git a/detrsmpl/core/renderer/torch3d_renderer/shader/shader.py b/detrsmpl/core/renderer/torch3d_renderer/shader/shader.py
new file mode 100644
index 0000000000000000000000000000000000000000..977b0859cfae5a7270aad3639318251e6bc68e3a
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/shader/shader.py
@@ -0,0 +1,103 @@
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from pytorch3d.ops import interpolate_face_attributes
+from pytorch3d.renderer import BlendParams, hard_rgb_blend
+from pytorch3d.renderer.mesh.shader import SoftSilhouetteShader
+from pytorch3d.structures.utils import padded_to_packed
+
+
+class SilhouetteShader(SoftSilhouetteShader):
+ """Avoid unexpected keyword argument error."""
+ def __init__(self,
+ blend_params: Optional[BlendParams] = None,
+ **kwargs) -> None:
+ super().__init__(blend_params)
+
+
+class NoLightShader(nn.Module):
+ """No light shader."""
+ def __init__(self,
+ blend_params: Optional[BlendParams] = None,
+ **kwargs) -> None:
+ """Initlialize without blend_params."""
+ super().__init__()
+ self.blend_params = blend_params if blend_params is not None\
+ else BlendParams()
+
+ def forward(self, fragments, meshes, **kwargs) -> torch.Tensor:
+ """Sample without light."""
+ texels = meshes.sample_textures(fragments)
+ blend_params = kwargs.get('blend_params', self.blend_params)
+ images = hard_rgb_blend(texels, fragments, blend_params)
+ return images
+
+
+class DepthShader(nn.Module):
+ """No light shader."""
+ def __init__(self,
+ blend_params: Optional[BlendParams] = None,
+ **kwargs) -> None:
+ """Initlialize without blend_params."""
+ super().__init__()
+ self.blend_params = blend_params if blend_params is not None\
+ else BlendParams()
+
+ def forward(self, fragments, meshes, cameras, **kwargs) -> torch.Tensor:
+ """Sample without light."""
+ verts_depth = cameras.compute_depth_of_points(meshes.verts_padded())
+ faces = meshes.faces_packed() # (F, 3)
+ verts_depth = padded_to_packed(verts_depth)
+ faces_depth = verts_depth[faces]
+ depth_map = interpolate_face_attributes(
+ pix_to_face=fragments.pix_to_face,
+ barycentric_coords=fragments.bary_coords,
+ face_attributes=faces_depth)
+ return depth_map[..., 0, :]
+
+
+class NormalShader(nn.Module):
+ """No light shader."""
+ def __init__(self,
+ blend_params: Optional[BlendParams] = None,
+ **kwargs) -> None:
+ """Initlialize without blend_params."""
+ super().__init__()
+ self.blend_params = blend_params if blend_params is not None\
+ else BlendParams()
+
+ def forward(self, fragments, meshes, cameras, **kwargs) -> torch.Tensor:
+ """Sample without light."""
+ verts_normal = cameras.compute_normal_of_meshes(meshes)
+ faces = meshes.faces_packed() # (F, 3)
+ verts_normal = padded_to_packed(verts_normal)
+ faces_normal = verts_normal[faces]
+ normal_map = interpolate_face_attributes(
+ pix_to_face=fragments.pix_to_face,
+ barycentric_coords=fragments.bary_coords,
+ face_attributes=faces_normal)
+ return normal_map[..., 0, :]
+
+
+class SegmentationShader(nn.Module):
+ """No light shader."""
+ def __init__(self,
+ blend_params: Optional[BlendParams] = None,
+ **kwargs) -> None:
+ """Initlialize without blend_params."""
+ super().__init__()
+ self.blend_params = blend_params if blend_params is not None\
+ else BlendParams()
+
+ def forward(self, fragments, meshes, **kwargs) -> torch.Tensor:
+ """Sample without light."""
+ verts_class = meshes.textures.verts_features_padded()
+ faces = meshes.faces_packed() # (F, 3)
+ verts_class = padded_to_packed(verts_class)
+ faces_class = verts_class[faces]
+ segmentation_map = interpolate_face_attributes(
+ pix_to_face=fragments.pix_to_face,
+ barycentric_coords=fragments.bary_coords,
+ face_attributes=faces_class).long()
+ return segmentation_map[..., :, 0]
diff --git a/detrsmpl/core/renderer/torch3d_renderer/silhouette_renderer.py b/detrsmpl/core/renderer/torch3d_renderer/silhouette_renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..850f9d7bec5e107b8b2abc56d655da7e2c017b01
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/silhouette_renderer.py
@@ -0,0 +1,89 @@
+from typing import Iterable, Optional, Tuple, Union
+
+import torch
+from pytorch3d.structures import Meshes
+
+from detrsmpl.core.cameras import MMCamerasBase
+from .base_renderer import BaseRenderer
+from .utils import normalize
+
+
+class SilhouetteRenderer(BaseRenderer):
+ """Silhouette renderer."""
+ shader_type = 'SilhouetteShader'
+
+ def __init__(
+ self,
+ resolution: Tuple[int, int] = None,
+ device: Union[torch.device, str] = 'cpu',
+ output_path: Optional[str] = None,
+ out_img_format: str = '%06d.png',
+ **kwargs,
+ ) -> None:
+ """SilhouetteRenderer for neural rendering and visualization.
+
+ Args:
+ resolution (Iterable[int]):
+ (width, height) of the rendered images resolution.
+ device (Union[torch.device, str], optional):
+ You can pass a str or torch.device for cpu or gpu render.
+ Defaults to 'cpu'.
+ output_path (Optional[str], optional):
+ Output path of the video or images to be saved.
+ Defaults to None.
+ out_img_format (str, optional): The image format string for
+ saving the images.
+ Defaults to '%06d.png'.
+
+ Returns:
+ None
+ """
+ super().__init__(resolution=resolution,
+ device=device,
+ output_path=output_path,
+ out_img_format=out_img_format,
+ **kwargs)
+
+ def forward(self,
+ meshes: Optional[Meshes] = None,
+ cameras: Optional[MMCamerasBase] = None,
+ images: Optional[torch.Tensor] = None,
+ indexes: Iterable[str] = None,
+ backgrounds: Optional[torch.Tensor] = None,
+ **kwargs):
+ """Render silhouette map.
+
+ Args:
+ meshes (Optional[Meshes], optional): meshes to be rendered.
+                Textures are not used for silhouette rendering.
+ Defaults to None.
+ cameras (Optional[MMCamerasBase], optional): cameras for render.
+ Defaults to None.
+ indexes (Optional[Iterable[int]], optional): indexes for images.
+ Defaults to None.
+ backgrounds (Optional[torch.Tensor], optional): background images.
+ Defaults to None.
+
+ Returns:
+ Union[torch.Tensor, None]: return tensor or None.
+ """
+ meshes = meshes.to(self.device)
+ self._update_resolution(cameras, **kwargs)
+ fragments = self.rasterizer(meshes_world=meshes, cameras=cameras)
+ silhouette_map = self.shader(fragments=fragments,
+ meshes=meshes,
+ cameras=cameras)
+
+ if self.output_path is not None:
+ rgba = self.tensor2rgba(silhouette_map)
+ self._write_images(rgba, backgrounds, indexes)
+
+ return silhouette_map
+
+ def tensor2rgba(self, tensor: torch.Tensor):
+ silhouette = tensor[..., 3:]
+ rgbs = silhouette.repeat(1, 1, 1, 3)
+ valid_masks = (silhouette > 0) * 1.0
+ rgbs = normalize(rgbs, out_value_range=(0, 1))
+ return torch.cat([rgbs, valid_masks], -1)
diff --git a/detrsmpl/core/renderer/torch3d_renderer/smpl_renderer.py b/detrsmpl/core/renderer/torch3d_renderer/smpl_renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..870d0dde2c5d2e0192856a08519eb8b5cd99f8f7
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/smpl_renderer.py
@@ -0,0 +1,279 @@
+import os.path as osp
+from pathlib import Path
+from typing import Iterable, Optional, Tuple, Union
+
+import cv2
+import mmcv
+import numpy as np
+import torch
+from pytorch3d.structures import Meshes
+from torch.nn.functional import interpolate
+
+from detrsmpl.core.cameras import MMCamerasBase
+from detrsmpl.utils.ffmpeg_utils import images_to_array
+from detrsmpl.utils.path_utils import check_path_suffix
+from .base_renderer import BaseRenderer
+from .builder import build_renderer
+from .lights import DirectionalLights, PointLights
+from .utils import align_input_to_padded, normalize, rgb2bgr, tensor2array
+
+
+class SMPLRenderer(BaseRenderer):
+ """Render SMPL(X) with different render choices."""
+ def __init__(self,
+ resolution: Tuple[int, int] = None,
+ device: Union[torch.device, str] = 'cpu',
+ output_path: Optional[str] = None,
+ return_tensor: bool = False,
+ alpha: float = 1.0,
+ out_img_format: str = '%06d.png',
+ read_img_format: str = None,
+ render_choice='mq',
+ frames_folder: Optional[str] = None,
+ plot_kps: bool = False,
+ vis_kp_index: bool = False,
+ final_resolution: Tuple[int, int] = None,
+ **kwargs) -> None:
+ super(BaseRenderer, self).__init__()
+
+ self.device = device
+ self.resolution = resolution
+ self.render_choice = render_choice
+ self.output_path = output_path
+ self.frames_folder = frames_folder
+ self.plot_kps = plot_kps
+ self.vis_kp_index = vis_kp_index
+ self.read_img_format = read_img_format
+ self.out_img_format = out_img_format
+ self.final_resolution = final_resolution
+ self.return_tensor = return_tensor
+ if output_path is not None:
+ if check_path_suffix(output_path, ['.mp4', '.gif']):
+ self.temp_path = osp.join(
+ Path(output_path).parent,
+ Path(output_path).name + '_output_temp')
+ mmcv.mkdir_or_exist(self.temp_path)
+ print('make dir', self.temp_path)
+ else:
+ self.temp_path = output_path
+
+ self.image_renderer = build_renderer(
+ dict(device=device, resolution=resolution, **kwargs))
+
+ if plot_kps:
+ self.alpha = max(min(0.8, alpha), 0.1)
+ self.joints_renderer = build_renderer(
+ dict(type='pointcloud',
+ resolution=resolution,
+ device=device,
+ radius=0.008))
+ else:
+ self.alpha = max(min(1.0, alpha), 0.1)
+ """
+ Render Mesh for SMPL and SMPL-X. For function render_smpl.
+ 2 modes: mesh render with different quality and palette,
+ or silhouette render.
+
+ Args:
+ resolution (Iterable[int]): (height, width of render images)
+ faces (Union[np.ndarray, torch.LongTensor]): face of mesh to
+ be rendered.
+ device (torch.device, optional): cuda or cpu device.
+ Defaults to torch.device('cpu').
+ output_path (Optional[str], optional): render output path.
+ could be .mp4 or .gif or a folder.
+ Else: 1). If `render_choice` in ['lq', 'mq', 'hq'], the output
+ video will be a smpl mesh video which each person in a single
+ color.
+ 2). If `render_choice` is `silhouette`, the output video will
+ be a black-white smpl silhouette video.
+ 3). If `render_choice` is `part_silhouette`, the output video
+ will be a smpl mesh video which each body-part in a single
+ color.
+ If None, no video will be wrote.
+ Defaults to None.
+ palette (Optional[List[str]], optional):
+ List of palette string. Defaults to ['blue'].
+ return_tensor (bool, optional): Whether return tensors.
+ return None if set to False.
+ Defaults to False.
+ alpha (float, optional): transparency value, from 0.0 to 1.0.
+ Defaults to 1.0.
+
+ Returns:
+ None
+ """
+
+ def to(self, device):
+ return super(BaseRenderer, self).to(device)
+
+ def forward(
+ self,
+ meshes: Meshes,
+ cameras: Optional[MMCamerasBase] = None,
+ images: Optional[torch.Tensor] = None,
+ joints: Optional[torch.Tensor] = None,
+ joints_gt: Optional[torch.Tensor] = None,
+ indexes: Optional[Iterable[int]] = None,
+ **kwargs,
+ ) -> Union[None, torch.Tensor]:
+ """Forward render procedure.
+
+ Args:
+            meshes (Meshes): meshes to be rendered, one batch element per
+                frame. The number of people per frame influences the
+                visualization.
+ images (Optional[torch.Tensor], optional): Tensor of background
+ images. If None, no background.
+ Defaults to None.
+ joints (Optional[torch.Tensor], optional):
+ joints produced from smpl model.
+ Defaults to None.
+ joints_gt (Optional[torch.Tensor], optional):
+ ground-truth points passed.
+ Defaults to None.
+ indexes (Optional[Iterable[int]], optional):
+ indexes for writing images.
+ Defaults to None.
+
+ Returns:
+ Union[None, torch.Tensor]:
+ return None if not return_tensor.
+ Else: 1). If render images, the output tensor shape would be
+ (frame, h, w, 4) or (frame, num_people, h, w, 4), depends on
+ number of people.
+ 2). If render silhouette, the output tensor shape will be
+ (frame, h, w) or (frame, num_people, h, w).
+ 3). If render part silhouette, the output tensor shape should
+                be (frame, h, w, 1) or (frame, num_people, h, w, 1).
+ """
+ num_frames = len(meshes)
+ if self.frames_folder is not None and images is None:
+
+ images = images_to_array(self.frames_folder,
+ resolution=self.resolution,
+ img_format=self.read_img_format,
+ start=indexes[0],
+ end=indexes[-1] + 1,
+ disable_log=True).astype(np.float64)
+ images = torch.Tensor(images).to(self.device)
+ images = align_input_to_padded(
+ images,
+ ndim=4,
+ batch_size=num_frames,
+ padding_mode='ones',
+ )
+ if images is not None:
+ images = images.to(self.device)
+
+ lights = getattr(self.image_renderer, 'lights', None)
+ if isinstance(lights, DirectionalLights):
+ lights = lights.clone()
+ lights.direction = -cameras.get_camera_plane_normals()
+ elif isinstance(lights, PointLights):
+ lights = lights.clone()
+ lights.location = -cameras.get_camera_plane_normals(
+ ) - cameras.get_camera_center()
+
+ rendered_tensor = self.image_renderer(meshes=meshes,
+ cameras=cameras,
+ lights=lights,
+ indexes=indexes)
+
+ rendered_images = self.image_renderer.tensor2rgba(rendered_tensor)
+
+ rgbs = rendered_images[..., :3]
+ valid_masks = rendered_images[..., 3:]
+ images = normalize(images,
+ origin_value_range=[0, 255],
+ out_value_range=[0, 1],
+ dtype=torch.float32) if images is not None else None
+
+ bgrs = rgb2bgr(rgbs)
+
+ # write temp images for the output video
+ if self.output_path is not None:
+
+ if images is not None:
+ output_images = bgrs * valid_masks * self.alpha + \
+ images * valid_masks * (
+ 1 - self.alpha) + (1 - valid_masks) * images
+
+ else:
+ output_images = bgrs
+
+ if self.plot_kps:
+
+ joints = joints.to(self.device)
+ joints_2d = cameras.transform_points_screen(
+ joints, image_size=self.resolution)[..., :2]
+ if joints_gt is None:
+ joints_padded = joints
+ num_joints = joints_padded.shape[1]
+ joints_rgb_padded = torch.ones(
+ num_frames, num_joints, 4) * (torch.tensor(
+ [0.0, 1.0, 0.0, 1.0]).view(1, 1, 4))
+ else:
+ joints_gt = joints_gt.to(self.device)
+ joints_padded = torch.cat([joints, joints_gt], dim=1)
+ num_joints = joints.shape[1]
+ num_joints_gt = joints_gt.shape[1]
+ joints_rgb = torch.ones(num_frames, num_joints, 4) * (
+ torch.tensor([0.0, 1.0, 0.0, 1.0]).view(1, 1, 4))
+ joints_rgb_gt = torch.ones(
+ num_frames, num_joints_gt, 4) * (torch.tensor(
+ [1.0, 0.0, 0.0, 1.0]).view(1, 1, 4))
+ joints_rgb_padded = torch.cat([joints_rgb, joints_rgb_gt],
+ dim=1)
+
+ pointcloud_images = self.joints_renderer(
+ vertices=joints_padded,
+ verts_rgba=joints_rgb_padded.to(self.device),
+ cameras=cameras)
+
+ pointcloud_rgb = pointcloud_images[..., :3]
+ pointcloud_bgr = rgb2bgr(pointcloud_rgb)
+ pointcloud_mask = (pointcloud_images[..., 3:] > 0) * 1.0
+ output_images = output_images * (
+ 1 - pointcloud_mask) + pointcloud_mask * pointcloud_bgr
+
+ output_images = tensor2array(output_images)
+
+ for frame_idx, real_idx in enumerate(indexes):
+ folder = self.temp_path if self.temp_path is not None else\
+ self.output_path
+ im = output_images[frame_idx]
+ if self.plot_kps and self.vis_kp_index:
+ point_xy = joints_2d[frame_idx]
+ for j_idx in range(point_xy.shape[-2]):
+ x = point_xy[j_idx, 0]
+ y = point_xy[j_idx, 1]
+ cv2.putText(im, str(j_idx), (int(x), int(y)),
+ cv2.FONT_HERSHEY_SIMPLEX,
+ 0.25 * self.final_resolution[1] / 500,
+ [0, 0, 0],
+ int(1 * self.final_resolution[1] / 1000))
+ if self.final_resolution != self.resolution:
+                    im = cv2.resize(im, self.final_resolution,
+                                    interpolation=cv2.INTER_CUBIC)
+ # cv2.imwrite(osp.join(folder, self.out_img_format % real_idx),
+ # im)
+ cv2.imwrite(self.output_path, im)
+
+ # return
+ if self.return_tensor:
+
+ if images is not None:
+ rendered_map = torch.tensor(output_images)
+ else:
+ rendered_map = rendered_tensor
+
+ if self.final_resolution != self.resolution:
+ rendered_map = interpolate(rendered_map,
+ size=self.final_resolution,
+ mode='bilinear')
+ return rendered_map
+ else:
+ return output_images
diff --git a/detrsmpl/core/renderer/torch3d_renderer/textures/__init__.py b/detrsmpl/core/renderer/torch3d_renderer/textures/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f7c950fc41d29935d34dbc8d7daa585da2b2f42
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/textures/__init__.py
@@ -0,0 +1,10 @@
+# yapf: disable
+from .builder import ( # noqa:F401
+ TexturesAtlas,
+ TexturesNearest,
+ TexturesUV,
+ TexturesVertex,
+ build_textures,
+)
+
+# yapf: enable
diff --git a/detrsmpl/core/renderer/torch3d_renderer/textures/builder.py b/detrsmpl/core/renderer/torch3d_renderer/textures/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..f91ccad7747711dd15774d598134730166e60224
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/textures/builder.py
@@ -0,0 +1,22 @@
+from mmcv.utils import Registry
+from pytorch3d.renderer import TexturesAtlas, TexturesUV, TexturesVertex
+
+from .textures import TexturesNearest
+
+TEXTURES = Registry('textures')
+TEXTURES.register_module(
+ name=['TexturesAtlas', 'textures_atlas', 'atlas', 'Atlas'],
+ module=TexturesAtlas)
+TEXTURES.register_module(
+ name=['TexturesNearest', 'textures_nearest', 'nearest', 'Nearest'],
+ module=TexturesNearest)
+TEXTURES.register_module(name=['TexturesUV', 'textures_uv', 'uv'],
+ module=TexturesUV)
+TEXTURES.register_module(
+ name=['TexturesVertex', 'textures_vertex', 'vertex', 'vc'],
+ module=TexturesVertex)
+
+
+def build_textures(cfg):
+ """Build textures."""
+ return TEXTURES.build(cfg)
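+
+
+if __name__ == '__main__':
+    # Minimal sketch (not part of the original module): build a vertex-color
+    # texture from a registry config, as the renderers do when attaching
+    # per-vertex colors to a mesh. The vertex count 6890 matches SMPL.
+    import torch
+
+    textures = build_textures(
+        dict(type='vertex', verts_features=torch.rand(1, 6890, 3)))
+    print(type(textures).__name__)  # TexturesVertex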
diff --git a/detrsmpl/core/renderer/torch3d_renderer/textures/textures.py b/detrsmpl/core/renderer/torch3d_renderer/textures/textures.py
new file mode 100644
index 0000000000000000000000000000000000000000..264e8f60cb225ae59719b5ee7fca2689b3aa9962
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/textures/textures.py
@@ -0,0 +1,23 @@
+import torch
+from pytorch3d.ops import interpolate_face_attributes
+from pytorch3d.renderer import TexturesVertex
+
+
+class TexturesNearest(TexturesVertex):
+ """Textures for nearest interpolation."""
+ def sample_textures(self, fragments, faces_packed=None) -> torch.Tensor:
+ """Rewrite sample_textures to use the nearest interpolation.
+
+ This function will only be called in render forwarding.
+ """
+ verts_features_packed = self.verts_features_packed()
+ faces_verts_features = verts_features_packed[faces_packed]
+ bary_coords = fragments.bary_coords
+ _, idx = torch.max(bary_coords, -1)
+ mask = torch.arange(bary_coords.size(-1)).reshape(1, 1, -1).to(
+ self.device) == idx.unsqueeze(-1)
+ bary_coords *= 0
+ bary_coords[mask] = 1
+ texels = interpolate_face_attributes(fragments.pix_to_face,
+ bary_coords, faces_verts_features)
+ return texels
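+
+
+if __name__ == '__main__':
+    # Minimal sketch (not part of the original module): attach nearest-
+    # interpolated per-vertex colors to a toy two-triangle mesh. Rendering is
+    # left to the mesh renderers elsewhere in this package.
+    from pytorch3d.structures import Meshes
+
+    verts = torch.rand(1, 4, 3)
+    faces = torch.tensor([[[0, 1, 2], [0, 2, 3]]])
+    textures = TexturesNearest(verts_features=torch.rand(1, 4, 3))
+    meshes = Meshes(verts=verts, faces=faces, textures=textures)
+    print(meshes.textures.verts_features_padded().shape)  # (1, 4, 3)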
diff --git a/detrsmpl/core/renderer/torch3d_renderer/utils.py b/detrsmpl/core/renderer/torch3d_renderer/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..26374dbe09bc12cd09fe33d9b12cfd618d803836
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/utils.py
@@ -0,0 +1,113 @@
+from typing import List, Union
+
+import numpy as np
+import torch
+from pytorch3d.structures import list_to_padded
+
+try:
+ from typing import Literal
+except ImportError:
+ from typing_extensions import Literal
+
+
+def normalize(value,
+ origin_value_range=None,
+ out_value_range=(0, 1),
+ dtype=None,
+ clip=False) -> Union[torch.Tensor, np.ndarray]:
+ """Normalize the tensor or array and convert dtype."""
+ if origin_value_range is not None:
+ value = (value - origin_value_range[0]) / (
+ origin_value_range[1] - origin_value_range[0] + 1e-9)
+
+ else:
+ value = (value - value.min()) / (value.max() - value.min())
+ value = value * (out_value_range[1] -
+ out_value_range[0]) + out_value_range[0]
+ if clip:
+ value = torch.clip(value,
+ min=out_value_range[0],
+ max=out_value_range[1])
+ if isinstance(value, torch.Tensor):
+ if dtype is not None:
+ return value.type(dtype)
+ else:
+ return value
+ elif isinstance(value, np.ndarray):
+ if dtype is not None:
+ return value.astype(dtype)
+ else:
+ return value
+
+
+def tensor2array(image: torch.Tensor) -> np.ndarray:
+ """Convert image tensor to array."""
+ image = image.detach().cpu().numpy()
+ image = normalize(image,
+ origin_value_range=(0, 1),
+ out_value_range=(0, 255),
+ dtype=np.uint8)
+ return image
+
+
+def array2tensor(image: np.ndarray) -> torch.Tensor:
+ """Convert image array to tensor."""
+ image = torch.Tensor(image)
+ image = normalize(image,
+ origin_value_range=(0, 255),
+ out_value_range=(0, 1),
+ dtype=torch.float32)
+ return image
+
+
+def rgb2bgr(rgbs) -> Union[torch.Tensor, np.ndarray]:
+ """Convert color channels."""
+ bgrs = [rgbs[..., 2, None], rgbs[..., 1, None], rgbs[..., 0, None]]
+ if isinstance(rgbs, torch.Tensor):
+ bgrs = torch.cat(bgrs, -1)
+ elif isinstance(rgbs, np.ndarray):
+ bgrs = np.concatenate(bgrs, -1)
+ return bgrs
+
+
+def align_input_to_padded(tensor: Union[List[torch.Tensor], torch.Tensor],
+ ndim: int = 3,
+ batch_size: int = None,
+ padding_mode: Literal['ones', 'zeros', 'repeat',
+ 'none'] = 'none'):
+ if isinstance(tensor, list):
+ for i in range(len(tensor)):
+            if tensor[i].ndim == ndim:
+ tensor[i] = tensor[i][0]
+ tensor = list_to_padded(tensor, equisized=True)
+ assert tensor.ndim in (ndim, ndim - 1)
+ if tensor.ndim == ndim - 1:
+ tensor = tensor.unsqueeze(0)
+
+ if batch_size is not None:
+ current_batch_size = tensor.shape[0]
+ if current_batch_size == 1:
+ tensor = tensor.repeat_interleave(batch_size, 0)
+ elif current_batch_size < batch_size:
+ if padding_mode == 'ones':
+ tensor = torch.cat([
+ tensor,
+ torch.ones_like(tensor)[:1].repeat_interleave(
+ batch_size - current_batch_size, 0)
+ ])
+            elif padding_mode == 'zeros':
+ tensor = torch.cat([
+ tensor,
+ torch.zeros_like(tensor)[:1].repeat_interleave(
+ batch_size - current_batch_size, 0)
+ ])
+ elif padding_mode == 'repeat':
+ tensor = tensor.repeat_interleave(
+ batch_size // current_batch_size + 1, 0)[:batch_size]
+ else:
+ raise ValueError('Wrong batch_size to allocate,'
+ ' please specify padding mode.')
+ elif current_batch_size > batch_size:
+ tensor = tensor[:batch_size]
+
+ return tensor
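+
+
+if __name__ == '__main__':
+    # Minimal self-check sketch (not part of the original module): map a fake
+    # uint8 image batch into [0, 1], swap its color channels and convert it
+    # back to uint8.
+    fake_rgb = np.random.randint(0, 256, size=(2, 4, 4, 3)).astype(np.uint8)
+    tensor = array2tensor(fake_rgb)   # float32 in [0, 1]
+    bgr = rgb2bgr(tensor)             # channel order reversed
+    restored = tensor2array(bgr)      # uint8 in [0, 255]
+    assert restored.shape == fake_rgb.shape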
diff --git a/detrsmpl/core/renderer/torch3d_renderer/uv_renderer.py b/detrsmpl/core/renderer/torch3d_renderer/uv_renderer.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a8e667d1a40504cf9e7b7004e79e6102b0bb3bc
--- /dev/null
+++ b/detrsmpl/core/renderer/torch3d_renderer/uv_renderer.py
@@ -0,0 +1,520 @@
+import warnings
+from typing import Iterable, Optional, Tuple, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from pytorch3d.io.obj_io import load_objs_as_meshes
+from pytorch3d.ops import interpolate_face_attributes
+from pytorch3d.renderer.mesh import TexturesUV
+from pytorch3d.renderer.mesh.rasterizer import (
+ MeshRasterizer,
+ RasterizationSettings,
+)
+from pytorch3d.structures import Meshes
+from pytorch3d.structures.utils import padded_to_packed
+
+from detrsmpl.core.cameras.cameras import (
+ FoVOrthographicCameras,
+ MMCamerasBase,
+)
+from detrsmpl.utils.path_utils import check_path_suffix
+from .utils import array2tensor, rgb2bgr
+
+
+class UVRenderer(nn.Module):
+ """Renderer for SMPL(x) UV map."""
+ def __init__(
+ self,
+ resolution: Tuple[int] = 1024,
+ model_type: Optional[str] = 'smpl',
+ uv_param_path: Optional[str] = None,
+ obj_path: Optional[str] = None,
+ device: Union[torch.device, str] = 'cpu',
+ threshold_size: int = 512,
+ # TODO: Solved the sample bug when the resolution is too small.
+ # set threshold_size is just a temporary solution.
+
+ # TODO: add smplx_uv.npz and eval the warping & sampling of smplx
+ # model.
+ ):
+ super().__init__()
+ self.threshold_size = threshold_size
+ num_verts = {'smpl': 6890, 'smplx': 10475}
+ self.NUM_VERTS = num_verts[model_type]
+ self.device = device
+ self.resolution = (resolution, resolution) if isinstance(
+ resolution, int) else resolution
+ self.uv_param_path = uv_param_path
+ self.obj_path = obj_path
+ if uv_param_path is not None:
+ check_path_suffix(uv_param_path, allowed_suffix=['npz'])
+ param_dict = dict(np.load(uv_param_path))
+
+ verts_uv = torch.Tensor(param_dict['verts_uv'])
+ verts_u, verts_v = torch.unbind(verts_uv, -1)
+ verts_v_ = 1 - verts_u.unsqueeze(-1)
+ verts_u_ = verts_v.unsqueeze(-1)
+ self.verts_uv = torch.cat([verts_u_, verts_v_], -1).to(self.device)
+ self.faces_uv = torch.LongTensor(param_dict['faces_uv']).to(
+ self.device)
+
+ self.NUM_VT = self.verts_uv.shape[0]
+
+ self.faces_tensor = torch.LongTensor(param_dict['faces'].astype(
+ np.int64)).to(self.device)
+ self.num_faces = self.faces_uv.shape[0]
+ elif obj_path is not None:
+ check_path_suffix(obj_path, allowed_suffix=['obj'])
+ mesh_template = load_objs_as_meshes([obj_path])
+ self.faces_uv = mesh_template.textures.faces_uvs_padded()[0].to(
+ self.device)
+ self.verts_uv = mesh_template.textures.verts_uvs_padded()[0].to(
+ self.device)
+ self.NUM_VT = self.verts_uv.shape[0]
+ self.faces_tensor = mesh_template.faces_padded()[0].to(self.device)
+ self.num_faces = self.faces_uv.shape[0]
+ self.update_fragments()
+ self.update_face_uv_pixel()
+
+ self = self.to(self.device)
+
+ def to(self, device):
+ if isinstance(device, str):
+ device = torch.device(device)
+ self.device = device
+ for k in dir(self):
+ if isinstance(getattr(self, k), (torch.Tensor)):
+ setattr(self, k, getattr(self, k).to(device))
+ return self
+
+ def update_fragments(self):
+ """Update pix_to_face, bary_coords."""
+ rasterizer = MeshRasterizer(cameras=FoVOrthographicCameras(
+ min_x=1, max_x=0, max_y=1, min_y=0, device=self.device),
+ raster_settings=RasterizationSettings(
+ blur_radius=0,
+ image_size=self.resolution,
+ faces_per_pixel=1,
+ perspective_correct=False,
+ )).to(self.device)
+ verts_uv = torch.cat([
+ self.verts_uv[None],
+ torch.ones(1, self.NUM_VT, 1).to(self.device)
+ ], -1)
+
+ fragments = rasterizer(
+ Meshes(verts=verts_uv, faces=self.faces_uv[None]))
+ self.pix_to_face = fragments.pix_to_face[0, ..., 0]
+ self.bary_coords = fragments.bary_coords[0, ..., 0, :]
+ self.mask = (self.pix_to_face >= 0).long()
+
+ def update_face_uv_pixel(self):
+ """Move the pixels lie on the edges inside the mask, then refine the
+ rest points by searching the nearest pixel in the faces it should be
+ in."""
+ H, W = self.resolution
+ device = self.device
+ cameras = FoVOrthographicCameras(min_x=1,
+ max_x=0,
+ max_y=1,
+ min_y=0,
+ device=self.device)
+ verts_uv = torch.cat([
+ self.verts_uv[None],
+ torch.ones(1, self.NUM_VT, 1).to(self.device)
+ ], -1)
+
+ verts_uv_pixel = cameras.transform_points_screen(
+ verts_uv, image_size=self.resolution).round().long()[0, ..., :2]
+ verts_uv_pixel[..., 0] = torch.clip(verts_uv_pixel[..., 0],
+ min=0,
+ max=W - 1)
+ verts_uv_pixel[..., 1] = torch.clip(verts_uv_pixel[..., 1],
+ min=0,
+ max=H - 1)
+ verts_uv_pixel = verts_uv_pixel.long()
+ mask = self.mask
+
+ wrong_indexes = torch.where(
+ mask[verts_uv_pixel.view(-1, 2)[:, 1],
+ verts_uv_pixel.view(-1, 2)[:, 0]] == 0)[0]
+ for wrong_index in wrong_indexes:
+ proposed_faces = torch.where(self.faces_uv == wrong_index)[0]
+ vert_xy = verts_uv_pixel[wrong_index]
+ faces_xy = []
+ for face_id in proposed_faces:
+ x = torch.where(self.pix_to_face == face_id)[1]
+ y = torch.where(self.pix_to_face == face_id)[0]
+ if x.shape[0] > 0:
+ face_xy = torch.cat([x.unsqueeze(-1), y.unsqueeze(-1)], -1)
+ faces_xy.append(face_xy)
+ if len(faces_xy) > 0:
+ faces_xy = torch.cat(faces_xy, 0)
+ min_arg = torch.argmin(
+ torch.sqrt(((faces_xy - vert_xy) *
+ (faces_xy - vert_xy)).sum(-1).float()))
+
+ verts_uv_pixel[wrong_index] = faces_xy[min_arg]
+
+ up_bound = ((mask[:-1] - mask[1:]) < 0).long()
+ bottom_bound = ((mask[1:] - mask[:-1]) < 0).long()
+ left_bound = ((mask[:, :-1] - mask[:, 1:]) < 0).long()
+ right_bound = ((mask[:, 1:] - mask[:, :-1]) < 0).long()
+
+ left_bound = torch.cat(
+ [left_bound, torch.zeros(H, 1).to(device)], 1).unsqueeze(-1)
+ right_bound = torch.cat([torch.zeros(H, 1).to(device), right_bound],
+ 1).unsqueeze(-1)
+ up_bound = torch.cat([up_bound, torch.zeros(1, W).to(device)],
+ 0).unsqueeze(-1)
+ bottom_bound = torch.cat([torch.zeros(1, W).to(device), bottom_bound],
+ 0).unsqueeze(-1)
+
+ leftup_corner_ = ((mask[:-1, :-1] - mask[1:, 1:]) < 0).long()
+ rightup_corner_ = ((mask[:-1, 1:] - mask[1:, :-1]) < 0).long()
+ leftbottom_corner_ = ((mask[1:, :-1] - mask[:-1, 1:]) < 0).long()
+ rightbottom_corner_ = ((mask[1:, 1:] - mask[:-1, :-1]) < 0).long()
+
+ leftup_corner = torch.zeros_like(mask).long()
+ leftup_corner[:-1, :-1] = leftup_corner_
+ leftup_corner = leftup_corner.unsqueeze(-1)
+
+ rightup_corner = torch.zeros_like(mask).long()
+ rightup_corner[:-1, 1:] = rightup_corner_
+ rightup_corner = rightup_corner.unsqueeze(-1)
+
+ leftbottom_corner = torch.zeros_like(mask).long()
+ leftbottom_corner[1:, :-1] = leftbottom_corner_
+ leftbottom_corner = leftbottom_corner.unsqueeze(-1)
+
+ rightbottom_corner = torch.zeros_like(mask).long()
+ rightbottom_corner[1:, 1:] = rightbottom_corner_
+ rightbottom_corner = rightbottom_corner.unsqueeze(-1)
+
+ stride_uv_mask = torch.cat([
+ right_bound * -1 + left_bound * 1 + rightbottom_corner * -1 +
+ leftbottom_corner * 1 + rightup_corner * -1 + leftup_corner * 1,
+ up_bound * 1 + bottom_bound * -1 + rightbottom_corner * -1 +
+ leftbottom_corner * -1 + rightup_corner * 1 + leftup_corner * 1
+ ], -1).long()
+
+ verts_uv_pixel = verts_uv_pixel + stride_uv_mask[
+ verts_uv_pixel.view(-1, 2)[:, 1],
+ verts_uv_pixel.view(-1, 2)[:, 0]].view(self.NUM_VT, 2)
+
+ face_uv_pixel = verts_uv_pixel[self.faces_uv]
+
+ face_uv_pixel = face_uv_pixel.long()
+ self.face_uv_pixel = face_uv_pixel
+
+ def forward(self,
+ verts_attr: Optional[torch.Tensor],
+ resolution: Optional[Iterable[int]] = None) -> torch.Tensor:
+ """Interpolate the vertex attributes to a map.
+
+ Args:
+ verts_attr (Optional[torch.Tensor]): shape should be (N, V, C),
+ required.
+ resolution (Optional[Iterable[int]], optional): resolution to
+ override self.resolution. If None, will use self.resolution.
+ Defaults to None.
+
+ Returns:
+ torch.Tensor: interpolated maps of (N, H, W, C)
+ """
+ if verts_attr.ndim == 2:
+ verts_attr = verts_attr[None]
+ if resolution is not None and resolution != self.resolution:
+ self.resolution = resolution
+ self.update_fragments()
+ self.update_face_uv_pixel()
+
+ bary_coords = self.bary_coords
+ pix_to_face = self.pix_to_face
+
+ N, V, C = verts_attr.shape
+ assert V == self.NUM_VERTS
+ verts_attr = verts_attr.view(N * V, C).to(self.device)
+        offset_idx = torch.arange(0, N).long() * self.NUM_VERTS
+ faces_packed = self.faces_tensor[None].repeat(
+ N, 1, 1) + offset_idx.view(-1, 1, 1).to(self.device)
+ faces_packed = faces_packed.view(-1, 3)
+ face_attr = verts_attr[faces_packed]
+ assert face_attr.shape == (N * self.num_faces, 3, C)
+ pix_to_face = self.pix_to_face.unsqueeze(0).repeat(N, 1,
+ 1).unsqueeze(-1)
+ bary_coords = self.bary_coords[None].repeat(N, 1, 1, 1).unsqueeze(-2)
+ maps_padded = interpolate_face_attributes(
+ pix_to_face=pix_to_face.to(self.device),
+ barycentric_coords=bary_coords.to(self.device),
+ face_attributes=face_attr.to(self.device),
+ ).squeeze(-2)
+ return maps_padded
+
+ def forward_normal_map(self,
+ meshes: Meshes = None,
+ vertices: torch.Tensor = None,
+ resolution: Optional[Iterable[int]] = None,
+ cameras: MMCamerasBase = None) -> torch.Tensor:
+ """Interpolate verts normals to a normal map.
+
+ Args:
+ meshes (Meshes): input smpl mesh.
+ Will override vertices if both not None.
+ Defaults to None.
+ vertices (torch.Tensor, optional):
+ smpl vertices. Defaults to None.
+ resolution (Optional[Iterable[int]], optional): resolution to
+ override self.resolution. If None, will use self.resolution.
+ Defaults to None.
+ cameras (MMCamerasBase, optional):
+ cameras to see the mesh.
+ Defaults to None.
+ Returns:
+ torch.Tensor: Normal map of shape (N, H, W, 3)
+ """
+        if meshes is None:
+            if vertices is None:
+                raise ValueError('No valid input.')
+            meshes = Meshes(verts=vertices,
+                            faces=self.faces_tensor[None].repeat(
+                                vertices.shape[0], 1, 1))
+        verts_normals = meshes.verts_normals_padded()
+ if cameras:
+ verts_normals = cameras.get_world_to_view_transform(
+ ).transform_normals(verts_normals)
+ normal_map = self.forward(verts_attr=verts_normals,
+ resolution=resolution)
+ return normal_map
+
+ def forward_uvd_map(self,
+ meshes: Meshes = None,
+ vertices: torch.Tensor = None,
+ resolution: Optional[Iterable[int]] = None,
+ cameras: MMCamerasBase = None) -> torch.Tensor:
+ """Interpolate the verts xyz value to a uvd map.
+
+ Args:
+ meshes (Meshes): input smpl mesh.
+ Defaults to None.
+ vertices (torch.Tensor, optional):
+ smpl vertices. Will override meshes if both not None.
+ Defaults to None.
+ resolution (Optional[Iterable[int]], optional): resolution to
+ override self.resolution. If None, will use self.resolution.
+ Defaults to None.
+ cameras (MMCamerasBase, optional):
+ cameras to see the mesh.
+ Defaults to None.
+
+ Returns:
+ torch.Tensor: UVD map of shape (N, H, W, 3)
+ """
+ if vertices is not None:
+ verts_uvd = vertices
+ elif vertices is None and meshes is not None:
+ verts_uvd = meshes.verts_padded()
+ else:
+ raise ValueError('No valid input.')
+ if cameras:
+            verts_uvd = cameras.get_world_to_view_transform(
+            ).transform_points(verts_uvd)
+ uvd_map = self.forward(verts_attr=verts_uvd, resolution=resolution)
+ return uvd_map
+
+ def vertex_resample(
+ self,
+ maps_padded: torch.Tensor,
+ h_flip: bool = False,
+ ) -> torch.Tensor:
+ """Resample the vertex attributes from a map.
+
+ Args:
+ maps_padded (torch.Tensor): shape should be (N, H, W, C). Required.
+ h_flip (bool, optional): whether flip horizontally.
+ Defaults to False.
+
+ Returns:
+ torch.Tensor: resampled vertex attributes. Shape will be (N, V, C)
+ """
+ if maps_padded.ndim == 3:
+ maps_padded = maps_padded[None]
+
+ if h_flip:
+ maps_padded = torch.flip(maps_padded, dims=[2])
+ N, H, W, C = maps_padded.shape
+
+ if H < self.threshold_size or W < self.threshold_size:
+ maps_padded = F.interpolate(
+ maps_padded.permute(0, 3, 1, 2),
+ size=(self.threshold_size, self.threshold_size),
+ mode='bicubic',
+ align_corners=False).permute(0, 2, 3, 1)
+ H, W = self.threshold_size, self.threshold_size
+ if (H, W) != self.resolution:
+ self.resolution = (H, W)
+ self.update_fragments()
+ self.update_face_uv_pixel()
+        offset_idx = torch.arange(0, N).long() * self.NUM_VERTS
+ faces_packed = self.faces_tensor[None].repeat(
+ N, 1, 1) + offset_idx.view(-1, 1, 1).to(self.device)
+ faces_packed = faces_packed.view(-1, 3)
+
+ verts_feature_packed = torch.zeros(N * self.NUM_VERTS,
+ C).to(self.device)
+
+ face_uv_pixel = self.face_uv_pixel.view(-1, 2)
+ verts_feature_packed[
+ faces_packed] = maps_padded[:, face_uv_pixel[:, 1],
+ face_uv_pixel[:, 0]].view(
+ N * self.num_faces, 3, C)
+ verts_feature_padded = verts_feature_packed.view(N, self.NUM_VERTS, C)
+
+ return verts_feature_padded
+
+ def wrap_normal(
+ self,
+ meshes: Meshes,
+ normal: torch.Tensor = None,
+ normal_map: torch.Tensor = None,
+ ) -> Meshes:
+ """Warp a normal map or vertex normal to the input meshes.
+
+ Args:
+ meshes (Meshes): the input meshes.
+ normal (torch.Tensor, optional): vertex normal. Shape should be
+ (N, V, 3).
+ Defaults to None.
+ normal_map (torch.Tensor, optional):
+ normal map. Defaults to None.
+
+ Returns:
+ Meshes: returned meshes.
+ """
+        if normal_map is not None and normal is None:
+            normal = self.vertex_resample(normal_map)
+        elif normal_map is not None and normal is not None:
+            normal_map = None
+            warnings.warn('Redundant input, will only take `normal`.')
+        elif normal_map is None and normal is None:
+            raise ValueError('No valid input.')
+ batch_size = len(meshes)
+ if normal.ndim == 2:
+ normal = normal[None]
+ assert normal.shape[1:] == (self.NUM_VERTS, 3)
+ assert normal.shape[0] in [batch_size, 1]
+
+ if normal.shape[0] == 1:
+ normal = normal.repeat(batch_size, 1, 1)
+ meshes = meshes.clone()
+
+ meshes._set_verts_normals(normal)
+ return meshes
+
+ def wrap_displacement(
+ self,
+ meshes: Meshes,
+ displacement: torch.Tensor = None,
+ displacement_map: torch.Tensor = None,
+ ) -> Meshes:
+ """Offset a vertex displacement or displacement_map to the input
+ meshes.
+
+ Args:
+ meshes (Meshes): the input meshes.
+ displacement (torch.Tensor, optional): vertex displacement.
+ shape should be (N, V, 3).
+ Defaults to None.
+ displacement_map (torch.Tensor, optional): displacement_map,
+ shape should be (N, H, W, 3).
+ Defaults to None.
+
+ Returns:
+ Meshes: returned meshes.
+ """
+ if displacement_map is not None and displacement is None:
+ displacement = self.vertex_resample(displacement_map)
+ elif displacement_map is not None and displacement is not None:
+ displacement_map = None
+ warnings.warn('Redundant input, will only take displacement.')
+ elif displacement_map is None and displacement is None:
+ raise ValueError('No valid input.')
+ batch_size = len(meshes)
+ if displacement.ndim == 2:
+ displacement = displacement[None]
+ assert displacement.shape[1] == self.NUM_VERTS
+ assert displacement.shape[0] in [batch_size, 1]
+
+ if displacement.shape[0] == 1:
+ displacement = displacement.repeat(batch_size, 1, 1)
+ C = displacement.shape[-1]
+ if C == 1:
+ displacement = meshes.verts_normals_padded() * displacement
+
+ displacement = padded_to_packed(displacement)
+
+ meshes = meshes.to(self.device)
+ meshes = meshes.offset_verts(displacement)
+ return meshes
+
+ def wrap_texture(self,
+ texture_map: torch.Tensor,
+ resolution: Optional[Iterable[int]] = None,
+ mode: Optional[str] = 'bicubic',
+                     is_bgr: bool = True) -> TexturesUV:
+        """Wrap a texture map into a `TexturesUV` texture.
+
+ Args:
+ texture_map (torch.Tensor): the texture map to be wrapped.
+ Shape should be (N, H, W, 3)
+ resolution (Optional[Iterable[int]], optional): resolution to
+ override self.resolution. If None, will use self.resolution.
+ Defaults to None.
+ mode (Optional[str], optional): interpolate mode.
+ Should be in ['nearest', 'bilinear', 'trilinear', 'bicubic',
+ 'area'].
+ Defaults to 'bicubic'.
+ is_bgr (bool, optional): Whether the color channel is BGR.
+ Defaults to True.
+
+ Returns:
+            TexturesUV: the texture object to be assigned to a mesh.
+ """
+
+ assert texture_map.shape[-1] == 3
+ if texture_map.ndim == 3:
+ texture_map_padded = texture_map[None]
+ elif texture_map.ndim == 4:
+ texture_map_padded = texture_map
+ else:
+ raise ValueError(f'Wrong texture_map shape: {texture_map.shape}.')
+ N, H, W, _ = texture_map_padded.shape
+
+ resolution = resolution if resolution is not None else (H, W)
+
+ if resolution != (H, W):
+            texture_map_padded = F.interpolate(
+                texture_map_padded.permute(0, 3, 1, 2),
+                size=resolution,
+                mode=mode).permute(0, 2, 3, 1)
+ assert texture_map_padded.shape[0] in [N, 1]
+
+ if isinstance(texture_map_padded, np.ndarray):
+ texture_map_padded = array2tensor(texture_map_padded)
+ is_bgr = True
+ if is_bgr:
+ texture_map_padded = rgb2bgr(texture_map_padded)
+
+ if texture_map_padded.shape[0] == 1:
+ texture_map_padded = texture_map_padded.repeat(N, 1, 1, 1)
+
+ faces_uvs = self.faces_uv[None].repeat(N, 1, 1)
+ verts_uvs = self.verts_uv[None].repeat(N, 1, 1)
+ textures = TexturesUV(faces_uvs=faces_uvs,
+ verts_uvs=verts_uvs,
+ maps=texture_map_padded)
+ return textures
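+
+
+# Usage sketch (illustrative only, not part of the original module): resample
+# per-vertex attributes through UV space. 'data/smpl_uv.npz' is a placeholder
+# path to an SMPL UV parameterisation file containing 'verts_uv', 'faces_uv'
+# and 'faces' entries; supply your own asset.
+#
+#   uv_renderer = UVRenderer(resolution=512,
+#                            model_type='smpl',
+#                            uv_param_path='data/smpl_uv.npz')
+#   verts_attr = torch.rand(1, 6890, 3)
+#   uv_map = uv_renderer(verts_attr)                 # (1, 512, 512, 3)
+#   resampled = uv_renderer.vertex_resample(uv_map)  # (1, 6890, 3)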
diff --git a/detrsmpl/core/renderer/vedo_render.py b/detrsmpl/core/renderer/vedo_render.py
new file mode 100644
index 0000000000000000000000000000000000000000..c772f61b89a2a57c4ecc4358bc981caf4c19ddfc
--- /dev/null
+++ b/detrsmpl/core/renderer/vedo_render.py
@@ -0,0 +1,107 @@
+import numpy as np
+import vedo
+from scipy.spatial.transform import Rotation as scipy_Rotation
+
+
+class VedoRenderer(object):
+ """An interactive renderer for camera visualization."""
+ def __init__(self, scale=0.03):
+ """Visualize cameras in an interactive scene supported by vedo.
+
+ Args:
+ scale (float, optional):
+ Scale factor. Defaults to 0.03.
+ """
+ self.scale = scale
+ self.axis_list = self.__init_axis()
+ self.camera_list = []
+ self.frames_dir_path = ''
+ self.y_reverse = False
+
+ def __init_axis(self, axis_len=80):
+ """Prepare arrows for axis.
+
+ Args:
+ axis_len (int, optional):
+ Length of each axis.
+ Defaults to 80.
+
+ Returns:
+ List[Arrows]:
+ A list of three arrows.
+ """
+ arrow_end_np = np.eye(3) * axis_len * self.scale
+ colors = ['r', 'g', 'b'] # r-x, g-y, b-z
+ ret_list = []
+ for axis_index in range(3):
+ ret_list.append(
+ vedo.Arrows([[0, 0, 0]],
+ [arrow_end_np[axis_index]]).c(colors[axis_index]))
+ return ret_list
+
+ def set_y_reverse(self):
+ """Set y reverse before add_camera if it is needed.
+
+ Vedo defines y+ as up direction. When visualizing kinect cameras, y- is
+ up, call set_y_reverse in this situation to make text in correct
+ direction.
+ """
+ self.y_reverse = True
+ self.y_reverse_rotation = \
+ scipy_Rotation.from_euler('z', 180, degrees=True)
+
+ def add_camera(self, camera_parameter, arrow_len=30):
+ """Add an camera to the scene.
+
+ Args:
+ camera_parameter (CameraParameter):
+ An instance of class CameraParameter which stores
+ rotation, translation and name of a camera.
+ arrow_len (int, optional):
+ Length of the arrow. Defaults to 30.
+
+ Returns:
+ list:
+ A list of vedo items related to the input camera.
+ """
+ rot_mat = np.asarray(camera_parameter.get_value('rotation_mat'))
+ translation = np.asarray(camera_parameter.get_value('translation'))
+ cam_center = -np.linalg.inv(rot_mat).dot(translation)
+ arrow_end_origin = np.eye(3) * arrow_len * self.scale
+ colors = ['r', 'g', 'b'] # r-x, g-y, b-z
+ arrow_end_camera = \
+ np.einsum('ij,kj->ki', np.linalg.inv(rot_mat), arrow_end_origin)
+ if self.y_reverse:
+ cam_center = self.y_reverse_rotation.apply(cam_center)
+ for axis_index in range(3):
+ arrow_end_camera[axis_index, :] = \
+ self.y_reverse_rotation.apply(
+ arrow_end_camera[axis_index, :]
+ )
+ vedo_list = []
+ for i in range(3):
+ vedo_list.append(
+ vedo.Arrows([cam_center],
+ [cam_center + arrow_end_camera[i]]).c(colors[i]))
+ vedo_list.append(
+ vedo.Text3D(camera_parameter.name, cam_center, s=self.scale * 10))
+ self.camera_list += vedo_list
+ return vedo_list
+
+ def show(self, with_axis=True, interactive=True):
+ """Show cameras as well as axis arrow by vedo.show()
+
+ Args:
+ with_axis (bool, optional):
+ Whether to show the axis arrow. Defaults to True.
+ interactive (bool, optional):
+ Pause and interact with window (True) or
+ continue execution (False).
+ Defaults to True.
+ """
+ list_to_show = []
+ list_to_show += self.camera_list
+ if with_axis:
+ list_to_show += self.axis_list
+ vedo.show(*list_to_show, interactive=interactive, axes=1)
+ vedo.clear()
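+
+
+# Usage sketch (illustrative only, not part of the original module):
+# `cam_param` is assumed to be a detrsmpl `CameraParameter` carrying
+# 'rotation_mat' and 'translation' values.
+#
+#   renderer = VedoRenderer()
+#   renderer.set_y_reverse()
+#   renderer.add_camera(cam_param)
+#   renderer.show(with_axis=True, interactive=False)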
diff --git a/detrsmpl/core/visualization/__init__.py b/detrsmpl/core/visualization/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..732bb4f496c953cc12204e57c21a888362a5dbad
--- /dev/null
+++ b/detrsmpl/core/visualization/__init__.py
@@ -0,0 +1,2 @@
+from .visualize_keypoints2d import visualize_kp2d # noqa:F401
+from .visualize_keypoints3d import visualize_kp3d # noqa:F401
diff --git a/detrsmpl/core/visualization/visualize_cameras.py b/detrsmpl/core/visualization/visualize_cameras.py
new file mode 100644
index 0000000000000000000000000000000000000000..7285a19915a483cd883fb81e5c3e267e0d4e766f
--- /dev/null
+++ b/detrsmpl/core/visualization/visualize_cameras.py
@@ -0,0 +1,82 @@
+import json
+import os
+
+from detrsmpl.core.cameras.camera_parameters import CameraParameter
+from detrsmpl.core.renderer.vedo_render import VedoRenderer
+from detrsmpl.utils.path_utils import check_path_suffix
+
+
+def visualize_chessboard_kinects_rgb(chessboard_path: str,
+ interactive: bool = True,
+ show: bool = True):
+ """Visualize all the RGB cameras in a chessboard file.
+
+ Args:
+ chessboard_path (str):
+ Path to the chessboard file.
+ interactive (bool, optional):
+ Pause and interact with window (True) or
+ continue execution (False).
+ Defaults to True.
+ show (bool, optional):
+ Whether to show in a window.
+ Defaults to True.
+ """
+ # Load camera parameter from a json file
+ camera_para_json_dict = json.load(open(chessboard_path))
+ camera_para_dict = {}
+ for camera_id in camera_para_json_dict.keys():
+        try:
+            camera_id_int = int(camera_id)
+        except ValueError:
+            continue
+        # if camera_id is an integer and it is divisible by 2,
+        # it is an RGB camera
+        if camera_id_int % 2 != 0:
+            continue
+ temp_camera_parameter = CameraParameter(name=camera_id)
+ temp_camera_parameter.load_from_chessboard(
+ camera_para_json_dict[camera_id], camera_id)
+ camera_para_dict[camera_id] = temp_camera_parameter
+ camera_vedo_renderer = VedoRenderer()
+ camera_vedo_renderer.set_y_reverse()
+ for camera_id in camera_para_dict.keys():
+ camera_vedo_renderer.add_camera(camera_para_dict[camera_id])
+ if show:
+ camera_vedo_renderer.show(with_axis=False, interactive=interactive)
+
+
+def visualize_dumped_camera_parameter(dumped_dir: str,
+ interactive: bool = True,
+ show: bool = True):
+ """Visualize all cameras dumped in a directory.
+
+ Args:
+ dumped_dir (str):
+ Path to the directory.
+ interactive (bool, optional):
+ Pause and interact with window (True) or
+ continue execution (False).
+ Defaults to True.
+ show (bool, optional):
+ Whether to show in a window.
+ Defaults to True.
+ """
+ file_list = os.listdir(dumped_dir)
+ camera_para_list = []
+ for file_name in file_list:
+ file_path = os.path.join(dumped_dir, file_name)
+        if not check_path_suffix(file_path, ['.json']):
+            continue
+        cam_para = CameraParameter()
+        cam_para.load(file_path)
+        camera_para_list.append(cam_para)
+ camera_vedo_renderer = VedoRenderer()
+ camera_vedo_renderer.set_y_reverse()
+ for camera_para in camera_para_list:
+ camera_vedo_renderer.add_camera(camera_para)
+ if show:
+ camera_vedo_renderer.show(with_axis=False, interactive=interactive)
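+
+
+# A minimal usage sketch (the directory below is illustrative; it should
+# contain dumped camera parameter json files):
+#
+#   visualize_dumped_camera_parameter('dumped_cameras/', interactive=True)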
diff --git a/detrsmpl/core/visualization/visualize_keypoints2d.py b/detrsmpl/core/visualization/visualize_keypoints2d.py
new file mode 100644
index 0000000000000000000000000000000000000000..bff23cc1a2e2649ea9ad9c12b710774126f73437
--- /dev/null
+++ b/detrsmpl/core/visualization/visualize_keypoints2d.py
@@ -0,0 +1,610 @@
+import glob
+import os
+import os.path as osp
+import shutil
+import warnings
+from pathlib import Path
+from typing import Iterable, List, Optional, Tuple, Union
+
+import cv2
+import numpy as np
+from tqdm import tqdm
+
+from detrsmpl.core.conventions.keypoints_mapping import KEYPOINTS_FACTORY
+from detrsmpl.core.conventions.keypoints_mapping.human_data import (
+ HUMAN_DATA_LIMBS_INDEX,
+ HUMAN_DATA_PALETTE,
+)
+from detrsmpl.utils.demo_utils import get_different_colors
+from detrsmpl.utils.ffmpeg_utils import images_to_video, video_to_images
+from detrsmpl.utils.keypoint_utils import search_limbs
+from detrsmpl.utils.path_utils import (
+ Existence,
+ check_input_path,
+ check_path_existence,
+ check_path_suffix,
+ prepare_output_path,
+)
+
+
+def _plot_kp2d_frame(kp2d_person: np.ndarray,
+ canvas: np.ndarray,
+ limbs: Union[list, dict,
+ np.ndarray] = HUMAN_DATA_LIMBS_INDEX,
+ palette: Optional[Union[dict, np.ndarray]] = None,
+ draw_bbox: bool = False,
+ with_number: bool = False,
+ font_size: Union[float, int] = 0.5,
+ disable_limbs: bool = False) -> np.ndarray:
+ """Plot a single frame(array) with keypoints, limbs, bbox, index.
+
+ Args:
+ kp2d_person (np.ndarray): `np.ndarray` shape of (J * 2).
+ canvas (np.ndarray): cv2 image, (H * W * 3) array.
+ limbs (Union[list, dict, np.ndarray], optional): limbs in form of
+ `dict` or 2-dimensional `list` or `np.ndarray` of shape
+ (num_limb, 2).
+ `dict` is used mainly for function `visualize_kp2d`, you can also
+ get the limbs by function `search_limbs`.
+ Defaults to `HUMAN_DATA_LIMBS_INDEX`.
+        palette (Optional[Union[dict, np.ndarray, list]], optional):
+            Pass a (1, 3) `np.ndarray` or a `list` [B, G, R] if you want all
+            limbs and keypoints to share the same color.
+            Pass `None` to use our colorful palette.
+            Pass a (num_limb, 3) `np.ndarray` to give each limb its own
+            color.
+            `dict` is used mainly for function `visualize_kp2d`; you can also
+            get the palette by function `search_limbs`.
+            Defaults to None.
+        draw_bbox (bool, optional): whether to draw bounding boxes.
+            Defaults to False.
+        with_number (bool, optional): whether to draw index numbers.
+            Defaults to False.
+        font_size (Union[float, int], optional): the font size of the index.
+            Defaults to 0.5.
+        disable_limbs (bool, optional): whether to disable drawing limbs.
+            Defaults to False.
+
+ Returns:
+ np.ndarray: opencv image of shape (H * W * 3).
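+
+    Example:
+        A minimal sketch drawing one skeleton on a blank canvas (the limb
+        indices below are illustrative, not a real convention):
+
+        >>> import numpy as np
+        >>> canvas = np.ones((256, 256, 3), dtype=np.uint8) * 255
+        >>> kp2d_person = np.random.rand(4, 2) * 256
+        >>> limbs = np.array([[0, 1], [1, 2], [2, 3]])
+        >>> canvas = _plot_kp2d_frame(kp2d_person, canvas, limbs=limbs,
+        ...                           palette=[255, 0, 0])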
+ """
+ # slice the kp2d array
+ kp2d_person = kp2d_person.copy()
+ if kp2d_person.shape[-1] >= 3:
+        kp2d_person = kp2d_person[..., :2]
+        warnings.warn(
+            'The input array has more than 2-dimensional coordinates, will '
+            'keep only the first 2 dimensions of the last axis. The new '
+            f'array shape: {kp2d_person.shape}')
+ if kp2d_person.ndim == 3 and kp2d_person.shape[0] == 1:
+ kp2d_person = kp2d_person[0]
+ assert kp2d_person.ndim == 2 and kp2d_person.shape[
+ -1] == 2, f'Wrong input array shape {kp2d_person.shape}, \
+ should be (num_kp, 2)'
+
+ if draw_bbox:
+ bbox = _get_bbox(kp2d_person, canvas, expand=True)
+ else:
+ bbox = None
+
+ # determine the limb connections and palette
+ if not disable_limbs:
+ if isinstance(limbs, list):
+ limbs = {'body': limbs}
+ elif isinstance(limbs, np.ndarray):
+ limbs = {'body': limbs.reshape(-1, 2).astype(np.int32).tolist()}
+ else:
+ assert set(limbs.keys()).issubset(HUMAN_DATA_LIMBS_INDEX)
+
+ if palette is None:
+ palette = {'body': None}
+ elif isinstance(palette, dict):
+ assert set(palette.keys()) == set(limbs.keys())
+ else:
+ limbs = {'body': None}
+ # draw by part to specify the thickness and color
+ for part_name, part_limbs in limbs.items():
+ # scatter_points_index means the limb end points
+ if not disable_limbs:
+ scatter_points_index = list(
+ set(np.array([part_limbs]).reshape(-1).tolist()))
+ else:
+ scatter_points_index = list(range(len(kp2d_person)))
+ if isinstance(palette, dict) and part_name == 'body':
+ thickness = 2
+ radius = 3
+ color = get_different_colors(len(scatter_points_index))
+ elif disable_limbs and palette is None:
+ radius = 2
+ color = get_different_colors(len(scatter_points_index))
+ else:
+ thickness = 2
+ radius = 2
+ if isinstance(palette, np.ndarray):
+ color = palette.astype(np.int32)
+ elif isinstance(palette, dict):
+ color = np.array(palette[part_name]).astype(np.int32)
+ elif isinstance(palette, list):
+ color = np.array(palette).reshape(-1, 3).astype(np.int32)
+ if not disable_limbs:
+ for limb_index, limb in enumerate(part_limbs):
+ limb_index = min(limb_index, len(color) - 1)
+ cv2.line(canvas,
+ tuple(kp2d_person[limb[0]].astype(np.int32)),
+ tuple(kp2d_person[limb[1]].astype(np.int32)),
+ color=tuple(color[limb_index].tolist()),
+ thickness=thickness)
+ # draw the points inside the image region
+ for index in scatter_points_index:
+ x, y = kp2d_person[index, :2]
+ if np.isnan(x) or np.isnan(y):
+ continue
+ if 0 <= x < canvas.shape[1] and 0 <= y < canvas.shape[0]:
+ if disable_limbs:
+ point_color = color[index].tolist()
+ else:
+ point_color = color[min(color.shape[0] - 1,
+ len(scatter_points_index) -
+ 1)].tolist()
+
+ cv2.circle(canvas, (int(x), int(y)),
+ radius,
+ point_color,
+ thickness=-1)
+ if with_number:
+ cv2.putText(
+ canvas, str(index), (int(x), int(y)),
+ cv2.FONT_HERSHEY_SIMPLEX, font_size,
+ np.array([255, 255, 255]).astype(np.int32).tolist(), 2)
+ # draw the bboxes
+ if bbox is not None:
+ bbox = bbox.astype(np.int32)
+ cv2.rectangle(canvas, (bbox[0], bbox[2]), (bbox[1], bbox[3]),
+ (0, 255, 255), 1)
+ return canvas
+
+
+def _get_bbox(keypoint_np: np.ndarray,
+ img_mat: Optional[np.ndarray] = None,
+ expand: bool = False):
+ """get bbox of kp2d."""
+ x_max = np.max(keypoint_np[:, 0])
+ x_min = np.min(keypoint_np[:, 0])
+ y_max = np.max(keypoint_np[:, 1])
+ y_min = np.min(keypoint_np[:, 1])
+ if expand and img_mat is not None:
+ x_expand = (x_max - x_min) * 0.1
+ y_expand = (y_max - y_min) * 0.1
+ x_min = max(0, x_min - x_expand)
+ x_max = min(img_mat.shape[1], x_max + x_expand)
+ y_min = max(0, y_min - y_expand)
+ y_max = min(img_mat.shape[0], y_max + y_expand)
+ return np.asarray([x_min, x_max, y_min, y_max])
+
+
+def _prepare_limb_palette(limbs,
+ palette,
+ pop_parts,
+ data_source,
+ mask,
+ search_limbs_func=search_limbs):
+ """Prepare limbs and their palette for plotting.
+
+ Args:
+ limbs (Union[np.ndarray, List[int]]):
+ The preset limbs. This option is for free skeletons like BVH file.
+ In most cases, it's set to None,
+ this function will search a result for limbs automatically.
+ palette (Iterable):
+ The preset palette for limbs. Specified palette,
+ three int represents (B, G, R). Should be tuple or list.
+ In most cases, it's set to None,
+ a palette will be generated with the result of search_limbs.
+ pop_parts (Iterable[str]):
+ The body part names you do not
+ want to visualize.
+ When it's none, nothing will be removed.
+ data_source (str):
+ Data source type.
+        mask (Union[list, np.ndarray]):
+ A mask to mask out the incorrect points.
+
+ Returns:
+ Tuple[dict, dict]: (limbs_target, limbs_palette).
+ """
+ if limbs is not None:
+ limbs_target, limbs_palette = {
+ 'body': limbs.tolist() if isinstance(limbs, np.ndarray) else limbs
+ }, get_different_colors(len(limbs))
+ else:
+ limbs_target, limbs_palette = search_limbs_func(
+ data_source=data_source, mask=mask)
+
+ if palette:
+ limbs_palette = np.array(palette, dtype=np.uint8)[None]
+
+ # check and pop the pop_parts
+ assert set(pop_parts).issubset(
+ HUMAN_DATA_PALETTE
+ ), f'wrong part_names in pop_parts, supported parts are\
+ {set(HUMAN_DATA_PALETTE.keys())}'
+
+ for part_name in pop_parts:
+ if part_name in limbs_target:
+ limbs_target.pop(part_name)
+ limbs_palette.pop(part_name)
+ return limbs_target, limbs_palette
+
+
+def _prepare_output_path(output_path, overwrite):
+ """Prepare output path."""
+ prepare_output_path(output_path,
+ allowed_suffix=['.mp4', ''],
+ tag='output video',
+ path_type='auto',
+ overwrite=overwrite)
+ # output_path is a directory
+ if check_path_suffix(output_path, ['']):
+ temp_folder = output_path
+ os.makedirs(temp_folder, exist_ok=True)
+ else:
+ temp_folder = output_path + '_temp_images'
+ if check_path_existence(temp_folder, 'dir') in [
+ Existence.DirectoryExistNotEmpty, Existence.DirectoryExistEmpty
+ ]:
+ shutil.rmtree(temp_folder)
+ os.makedirs(temp_folder, exist_ok=True)
+ return temp_folder
+
+
+def _check_frame_path(frame_list):
+ """Check frame path."""
+ for frame_path in frame_list:
+ if check_path_existence(frame_path, 'file') != Existence.FileExist or \
+ not check_path_suffix(frame_path, ['.png', '.jpg', '.jpeg']):
+ raise FileNotFoundError(
+ f'The frame should be .png or .jp(e)g: {frame_path}')
+
+
+def _check_temp_path(temp_folder, frame_list, overwrite):
+ """Check temp frame folder path."""
+ if not overwrite and frame_list is not None and len(frame_list) > 0:
+ if Path(temp_folder).absolute() == \
+ Path(frame_list[0]).parent.absolute():
+ raise FileExistsError(
+ f'{temp_folder} exists (set --overwrite to overwrite).')
+
+
+class _CavasProducer:
+ """Prepare background canvas, pure white if not set."""
+ def __init__(self,
+ frame_list,
+ resolution,
+ kp2d=None,
+ image_array=None,
+ default_scale=1.5):
+ """Initialize a canvas writer."""
+ # check the origin background frames
+ if frame_list is not None:
+ _check_frame_path(frame_list)
+ self.frame_list = frame_list
+ else:
+ self.frame_list = []
+ self.resolution = resolution
+ self.kp2d = kp2d
+
+ # with numpy array frames
+ self.image_array = image_array
+
+ if self.resolution is None:
+ if self.image_array is not None:
+ self.auto_resolution = self.image_array.shape[1:3]
+ elif len(self.frame_list) > 1 and \
+ check_path_existence(
+ self.frame_list[0], 'file') == Existence.FileExist:
+ tmp_image_array = cv2.imread(self.frame_list[0])
+ self.auto_resolution = tmp_image_array.shape[:2]
+ else:
+
+ self.auto_resolution = [
+ int(np.max(kp2d) * default_scale),
+ int(np.max(kp2d) * default_scale)
+ ]
+ self.len = kp2d.shape[0]
+
+ if self.image_array is None:
+ self.len_frame = len(self.frame_list)
+ else:
+ self.len_frame = self.image_array.shape[0]
+
+ def __getitem__(self, frame_index):
+ """Get frame data from frame_list of image_array."""
+ # frame file exists, resolution not set
+ if frame_index < self.len_frame and self.resolution is None:
+ if self.image_array is not None:
+ canvas = self.image_array[frame_index]
+ else:
+ canvas = cv2.imread(self.frame_list[frame_index])
+ if self.kp2d is None:
+ kp2d_frame = None
+ else:
+ kp2d_frame = self.kp2d[frame_index]
+ # no frame file, resolution has been set
+ elif frame_index >= self.len_frame and self.resolution is not None:
+ canvas = np.ones((self.resolution[0], self.resolution[1], 3),
+ dtype=np.uint8) * 255
+ if self.kp2d is None:
+ kp2d_frame = None
+ else:
+ kp2d_frame = self.kp2d[frame_index]
+ # frame file exists, resolution has been set
+ elif frame_index < self.len_frame and self.resolution is not None:
+ if self.image_array is not None:
+ canvas = self.image_array[frame_index]
+ else:
+ canvas = cv2.imread(self.frame_list[frame_index])
+ w_scale = self.resolution[1] / canvas.shape[1]
+ h_scale = self.resolution[0] / canvas.shape[0]
+ canvas = cv2.resize(canvas,
+ (self.resolution[1], self.resolution[0]),
+                                    interpolation=cv2.INTER_CUBIC)
+ if self.kp2d is None:
+ kp2d_frame = None
+ else:
+ kp2d_frame = np.array([[w_scale, h_scale]
+ ]) * self.kp2d[frame_index]
+ # no frame file, no resolution
+ else:
+ canvas = np.ones(
+ (self.auto_resolution[0], self.auto_resolution[1], 3),
+ dtype=np.uint8) * 255
+ if self.kp2d is None:
+ kp2d_frame = None
+ else:
+ kp2d_frame = self.kp2d[frame_index]
+ return canvas, kp2d_frame
+
+ def __len__(self):
+ return self.len
+
+
+def update_frame_list(frame_list, origin_frames, img_format, start, end):
+ """Update frame list if have origin_frames."""
+ input_temp_folder = None
+ # choose in frame_list or origin_frames
+ if frame_list is None and origin_frames is None:
+ print('No background provided, will use pure white background.')
+ elif frame_list is not None and origin_frames is not None:
+ warnings.warn('Redundant input, will only use frame_list.')
+ origin_frames = None
+ if origin_frames is not None:
+ check_input_path(input_path=origin_frames,
+ allowed_suffix=['.mp4', '.gif', ''],
+ tag='origin frames',
+ path_type='auto')
+ if Path(origin_frames).is_file():
+ input_temp_folder = origin_frames + '_temp_images/'
+ video_to_images(origin_frames,
+ input_temp_folder,
+ start=start,
+ end=end)
+ frame_list = glob.glob(osp.join(input_temp_folder, '*.png'))
+ frame_list.sort()
+ else:
+ if img_format is None:
+ frame_list = []
+ for im_name in os.listdir(origin_frames):
+ if Path(im_name).suffix.lower() in [
+ '.png', '.jpg', '.jpeg'
+ ]:
+ frame_list.append(osp.join(origin_frames, im_name))
+ else:
+ frame_list = []
+ for index in range(start, end):
+ frame_path = osp.join(origin_frames, img_format % index)
+ if osp.exists(frame_path):
+ frame_list.append(frame_path)
+ frame_list.sort()
+ return frame_list, input_temp_folder
+
+
+def visualize_kp2d(
+ kp2d: np.ndarray,
+ output_path: Optional[str] = None,
+ frame_list: Optional[List[str]] = None,
+ origin_frames: Optional[str] = None,
+ image_array: Optional[np.ndarray] = None,
+ limbs: Optional[Union[np.ndarray, List[int]]] = None,
+ palette: Optional[Iterable[int]] = None,
+ data_source: str = 'coco',
+ mask: Optional[Union[list, np.ndarray]] = None,
+ img_format: str = '%06d.png',
+ start: int = 0,
+ end: int = -1,
+ overwrite: bool = False,
+ with_file_name: bool = True,
+ resolution: Optional[Union[Tuple[int, int], list]] = None,
+ fps: Union[float, int] = 30,
+ draw_bbox: bool = False,
+ with_number: bool = False,
+ pop_parts: Iterable[str] = None,
+ disable_tqdm: bool = False,
+ disable_limbs: bool = False,
+ return_array: Optional[bool] = False,
+ keypoints_factory: dict = KEYPOINTS_FACTORY,
+ remove_raw_file: bool = True,
+) -> Union[None, np.ndarray]:
+ """Visualize 2d keypoints to a video or into a folder of frames.
+
+ Args:
+        kp2d (np.ndarray): should be an array of shape (f * J * 2)
+            or (f * n * J * 2).
+ output_path (str): output video path or image folder.
+        frame_list (Optional[List[str]], optional): list of origin background
+            frame paths, each element in the list should be an image path
+            like `*.jpg` or `*.png`. Higher priority than `origin_frames`.
+            Use this when your file names are hard to sort or you only want
+            to render a small number of frames.
+            Defaults to None.
+ origin_frames (Optional[str], optional): origin background frame path,
+ could be `.mp4`, `.gif`(will be sliced into a folder) or an image
+ folder. Lower priority than `frame_list`.
+ Defaults to None.
+ limbs (Optional[Union[np.ndarray, List[int]]], optional):
+ if not specified, the limbs will be searched by search_limbs,
+ this option is for free skeletons like BVH file.
+ Defaults to None.
+ palette (Iterable, optional): specified palette, three int represents
+ (B, G, R). Should be tuple or list.
+ Defaults to None.
+ data_source (str, optional): data source type. Defaults to 'coco'.
+        mask (Optional[Union[list, np.ndarray]], optional):
+            mask to mask out the incorrect points.
+            Pass a `np.ndarray` of shape (J,) or a `list` of length J.
+            Defaults to None.
+        img_format (str, optional): input image format.
+            Defaults to '%06d.png'.
+ start (int, optional): start frame index. Defaults to 0.
+ end (int, optional): end frame index. Defaults to -1.
+ overwrite (bool, optional): whether replace the origin frames.
+ Defaults to False.
+        with_file_name (bool, optional): whether to write the origin frame
+            name on the images. Defaults to True.
+        resolution (Optional[Union[Tuple[int, int], list]], optional):
+            (height, width) of the output video;
+            will be the same size as the original images if not specified.
+            Defaults to None.
+ fps (Union[float, int], optional): fps. Defaults to 30.
+        draw_bbox (bool, optional): whether to draw bounding boxes.
+            Defaults to False.
+        with_number (bool, optional): whether to draw index numbers.
+            Defaults to False.
+        pop_parts (Iterable[str], optional): The body part names you do not
+            want to visualize. Supported parts are ['left_eye', 'right_eye',
+            'nose', 'mouth', 'face', 'left_hand', 'right_hand'].
+            Defaults to None.
+ disable_tqdm (bool, optional):
+ Whether to disable the entire progressbar wrapper.
+ Defaults to False.
+        disable_limbs (bool, optional): whether to disable drawing limbs.
+            Defaults to False.
+        return_array (bool, optional): Whether to return images as an opencv
+            array. Defaults to False.
+ keypoints_factory (dict, optional): Dict of all the conventions.
+ Defaults to KEYPOINTS_FACTORY.
+
+ Raises:
+ FileNotFoundError: check output video path.
+ FileNotFoundError: check input frame paths.
+
+ Returns:
+ Union[None, np.ndarray].
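+
+    Example:
+        A minimal sketch rendering random keypoints on a pure white
+        background (the shapes assume the default `coco` convention with
+        17 keypoints; the output path is illustrative):
+
+        >>> import numpy as np
+        >>> kp2d = np.random.rand(30, 1, 17, 2) * 512
+        >>> visualize_kp2d(kp2d,
+        ...                output_path='demo_out/kp2d.mp4',
+        ...                resolution=(512, 512),
+        ...                data_source='coco')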
+ """
+
+ # check the input array shape, reshape to (num_frames, num_person, J, 2)
+ kp2d = kp2d[..., :2].copy()
+ if kp2d.ndim == 3:
+ kp2d = kp2d[:, np.newaxis]
+ assert kp2d.ndim == 4
+ num_frames, num_person = kp2d.shape[0], kp2d.shape[1]
+ # slice the input array temporally
+ end = (min(num_frames - 1, end) + num_frames) % num_frames
+ kp2d = kp2d[start:end + 1]
+
+ if image_array is not None:
+ origin_frames = None
+ frame_list = None
+ return_array = True
+ input_temp_folder = None
+ else:
+ frame_list, input_temp_folder = update_frame_list(
+ frame_list, origin_frames, img_format, start, end)
+
+ kp2d = kp2d[:num_frames]
+ # check output path
+ if output_path is not None:
+ output_temp_folder = _prepare_output_path(output_path, overwrite)
+ # check whether temp_folder will overwrite frame_list by accident
+ _check_temp_path(output_temp_folder, frame_list, overwrite)
+ else:
+ output_temp_folder = None
+
+ # check data_source & mask
+ if data_source not in keypoints_factory:
+        raise ValueError('Wrong data_source. Should choose in '
+ f'{list(keypoints_factory.keys())}')
+ if mask is not None:
+ if isinstance(mask, list):
+ mask = np.array(mask).reshape(-1)
+ assert mask.shape == (
+ len(keypoints_factory[data_source]),
+ ), f'mask length should fit with keypoints number \
+ {len(keypoints_factory[data_source])}'
+
+ # search the limb connections and palettes from superset smplx
+ # check and pop the pop_parts
+ if pop_parts is None:
+ pop_parts = []
+
+ if disable_limbs:
+ limbs_target, limbs_palette = None, None
+ else:
+ limbs_target, limbs_palette = _prepare_limb_palette(
+ limbs, palette, pop_parts, data_source, mask)
+ canvas_producer = _CavasProducer(frame_list, resolution, kp2d, image_array)
+
+ out_image_array = []
+ # start plotting by frame
+ for frame_index in tqdm(range(kp2d.shape[0]), disable=disable_tqdm):
+ canvas, kp2d_frame = canvas_producer[frame_index]
+ # start plotting by person
+ for person_index in range(num_person):
+ if num_person >= 2 and not disable_limbs:
+ limbs_palette = get_different_colors(
+ num_person)[person_index].reshape(1, 3)
+ canvas = _plot_kp2d_frame(kp2d_person=kp2d_frame[person_index],
+ canvas=canvas,
+ limbs=limbs_target,
+ palette=limbs_palette,
+ draw_bbox=draw_bbox,
+ with_number=with_number,
+ font_size=0.5,
+ disable_limbs=disable_limbs)
+ if with_file_name and frame_list is not None:
+ h, w, _ = canvas.shape
+ if frame_index <= len(frame_list) - 1:
+ cv2.putText(
+ canvas, str(Path(frame_list[frame_index]).name),
+ (w // 2, h // 10), cv2.FONT_HERSHEY_SIMPLEX, 0.5 * h / 500,
+ np.array([255, 255, 255]).astype(np.int32).tolist(), 2)
+ if output_path is not None:
+ # write the frame with opencv
+ if frame_list is not None and check_path_suffix(
+ output_path,
+ '') and len(frame_list) >= len(canvas_producer):
+ frame_path = os.path.join(output_temp_folder,
+ Path(frame_list[frame_index]).name)
+ img_format = None
+ else:
+ frame_path = \
+ os.path.join(output_temp_folder, f'{frame_index:06d}.png')
+ img_format = '%06d.png'
+ cv2.imwrite(frame_path, canvas)
+ if return_array:
+ out_image_array.append(canvas[None])
+
+ if input_temp_folder is not None:
+ shutil.rmtree(input_temp_folder)
+ # convert frames to video
+ if output_path is not None:
+ if check_path_suffix(output_path, ['.mp4']):
+ images_to_video(input_folder=output_temp_folder,
+ output_path=output_path,
+ remove_raw_file=remove_raw_file,
+ img_format=img_format,
+ fps=fps)
+
+ if return_array:
+ out_image_array = np.concatenate(out_image_array)
+ return out_image_array
diff --git a/detrsmpl/core/visualization/visualize_keypoints3d.py b/detrsmpl/core/visualization/visualize_keypoints3d.py
new file mode 100644
index 0000000000000000000000000000000000000000..dd7b2579084b781f5716a6301649861f2834a988
--- /dev/null
+++ b/detrsmpl/core/visualization/visualize_keypoints3d.py
@@ -0,0 +1,218 @@
+import warnings
+from typing import Iterable, List, Optional, Tuple, Union
+
+import numpy as np
+
+import detrsmpl.core.conventions.keypoints_mapping as keypoints_mapping
+from detrsmpl.core.renderer.matplotlib3d_renderer import Axes3dJointsRenderer
+from detrsmpl.utils.demo_utils import get_different_colors
+from detrsmpl.utils.keypoint_utils import search_limbs
+from detrsmpl.utils.path_utils import prepare_output_path
+
+
+def _norm_pose(pose_numpy: np.ndarray, min_value: Union[float, int],
+ max_value: Union[float, int], mask: Union[np.ndarray, list]):
+ """Normalize the poses and make the center close to axis center."""
+ assert max_value > min_value
+ pose_np_normed = pose_numpy.copy()
+ if not mask:
+ mask = list(range(pose_numpy.shape[-2]))
+ axis_num = 3
+ axis_stat = np.zeros(shape=[axis_num, 4])
+ for axis_index in range(axis_num):
+ axis_data = pose_np_normed[..., mask, axis_index]
+ axis_min = np.min(axis_data)
+ axis_max = np.max(axis_data)
+ axis_mid = (axis_min + axis_max) / 2.0
+ axis_span = axis_max - axis_min
+ axis_stat[axis_index] = np.asarray(
+ (axis_min, axis_max, axis_mid, axis_span))
+ target_mid = (max_value + min_value) / 2.0
+ max_span = np.max(axis_stat[:, 3])
+ target_span = max_value - min_value
+ for axis_index in range(axis_num):
+ pose_np_normed[..., axis_index] = \
+ pose_np_normed[..., axis_index] - \
+ axis_stat[axis_index, 2]
+ pose_np_normed = pose_np_normed / max_span * target_span
+ pose_np_normed = pose_np_normed + target_mid
+ return pose_np_normed
+
+
+def visualize_kp3d(
+ kp3d: np.ndarray,
+ output_path: Optional[str] = None,
+ limbs: Optional[Union[np.ndarray, List[int]]] = None,
+ palette: Optional[Iterable[int]] = None,
+ data_source: str = 'coco',
+ mask: Optional[Union[list, tuple, np.ndarray]] = None,
+ start: int = 0,
+ end: Optional[int] = None,
+ resolution: Union[list, Tuple[int, int]] = (1024, 1024),
+ fps: Union[float, int] = 30,
+ frame_names: Optional[Union[List[str], str]] = None,
+ orbit_speed: Union[float, int] = 0.5,
+ value_range: Union[Tuple[int, int], list] = (-100, 100),
+ pop_parts: Iterable[str] = (),
+ disable_limbs: bool = False,
+ return_array: Optional[bool] = None,
+ convention: str = 'opencv',
+ keypoints_factory: dict = keypoints_mapping.KEYPOINTS_FACTORY,
+) -> Union[None, np.ndarray]:
+ """Visualize 3d keypoints to a video with matplotlib. Support multi person
+ and specified limb connections.
+
+ Args:
+ kp3d (np.ndarray): shape could be (f * J * 4/3/2) or
+ (f * num_person * J * 4/3/2)
+        output_path (str): output video path or image folder.
+ limbs (Optional[Union[np.ndarray, List[int]]], optional):
+ if not specified, the limbs will be searched by search_limbs,
+ this option is for free skeletons like BVH file.
+ Defaults to None.
+ palette (Iterable, optional): specified palette, three int represents
+ (B, G, R). Should be tuple or list.
+ Defaults to None.
+ data_source (str, optional): data source type. Defaults to 'coco'.
+ choose in ['coco', 'smplx', 'smpl', 'coco_wholebody',
+ 'mpi_inf_3dhp', 'mpi_inf_3dhp_test', 'h36m', 'pw3d', 'mpii']
+ mask (Optional[Union[list, tuple, np.ndarray]], optional):
+ mask to mask out the incorrect points. Defaults to None.
+ start (int, optional): start frame index. Defaults to 0.
+ end (int, optional): end frame index.
+ Could be positive int or negative int or None.
+ None represents include all the frames.
+ Defaults to None.
+        resolution (Union[list, Tuple[int, int]], optional):
+            (width, height) of the output video.
+            Defaults to (1024, 1024).
+ fps (Union[float, int], optional): fps. Defaults to 30.
+        frame_names (Optional[Union[List[str], str]], optional): a list (with
+            the same length as the number of frames), a single string, or a
+            format string (like 'frame%06d') for the frame title; no title
+            if None.
+            Defaults to None.
+ orbit_speed (Union[float, int], optional): orbit speed of camera.
+ Defaults to 0.5.
+ value_range (Union[Tuple[int, int], list], optional):
+ range of axis value. Defaults to (-100, 100).
+        pop_parts (Iterable[str], optional): The body part names you do not
+            want to visualize. Choose in ['left_eye', 'right_eye', 'nose',
+            'mouth', 'face', 'left_hand', 'right_hand']. Defaults to ().
+ disable_limbs (bool, optional): whether need to disable drawing limbs.
+ Defaults to False.
+        return_array (bool, optional): Whether to return images as an opencv
+            array. If None, an array will be returned when the frame number
+            is below 100.
+            Defaults to None.
+ keypoints_factory (dict, optional): Dict of all the conventions.
+ Defaults to KEYPOINTS_FACTORY.
+ Raises:
+ TypeError: check the type of input keypoints.
+ FileNotFoundError: check the output video path.
+
+ Returns:
+ Union[None, np.ndarray].
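+
+    Example:
+        A minimal sketch with random 3d keypoints (the shapes assume the
+        default `coco` convention with 17 keypoints; the output path is
+        illustrative):
+
+        >>> import numpy as np
+        >>> kp3d = np.random.rand(30, 17, 3) * 100
+        >>> visualize_kp3d(kp3d,
+        ...                output_path='demo_out/kp3d.mp4',
+        ...                data_source='coco',
+        ...                value_range=(-100, 100))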
+ """
+ # check input shape
+ if not isinstance(kp3d, np.ndarray):
+ raise TypeError(
+ f'Input type is {type(kp3d)}, which should be numpy.ndarray.')
+ kp3d = kp3d.copy()
+ if kp3d.shape[-1] == 2:
+ kp3d = np.concatenate([kp3d, np.zeros_like(kp3d)[..., 0:1]], axis=-1)
+ warnings.warn(
+ 'The input array is 2-Dimensional coordinates, will concatenate ' +
+ f'zeros to the last axis. The new array shape: {kp3d.shape}')
+ elif kp3d.shape[-1] >= 4:
+ kp3d = kp3d[..., :3]
+ warnings.warn(
+ 'The input array has more than 3-Dimensional coordinates, will ' +
+ 'keep only the first 3-Dimensions of the last axis. The new ' +
+ f'array shape: {kp3d.shape}')
+ if kp3d.ndim == 3:
+ kp3d = np.expand_dims(kp3d, 1)
+ num_frames = kp3d.shape[0]
+ assert kp3d.ndim == 4
+ assert kp3d.shape[-1] == 3
+
+ if return_array is None:
+ if num_frames > 100:
+ return_array = False
+ else:
+ return_array = True
+
+ # check data_source & mask
+ if data_source not in keypoints_factory:
+        raise ValueError('Wrong data_source. Should choose in ' +
+ f'{list(keypoints_factory.keys())}')
+ if mask is not None:
+ if not isinstance(mask, np.ndarray):
+ mask = np.array(mask).reshape(-1)
+ assert mask.shape == (
+ len(keypoints_factory[data_source]),
+ ), f'mask length should fit with keypoints number \
+ {len(keypoints_factory[data_source])}'
+
+ # check the output path
+ if output_path is not None:
+ prepare_output_path(output_path,
+ path_type='auto',
+ tag='output video',
+ allowed_suffix=['.mp4', '.gif', ''])
+
+ # slice the frames
+ end = num_frames if end is None else end
+ kp3d = kp3d[start:end]
+ # norm the coordinates
+ if value_range is not None:
+ # norm pose location to value_range (70% value range)
+ mask_index = np.where(np.array(mask) > 0) if mask is not None else None
+        margin_width = abs(value_range[1] - value_range[0]) * 0.15
+ pose_np_normed = _norm_pose(kp3d, value_range[0] + margin_width,
+ value_range[1] - margin_width, mask_index)
+ input_pose_np = pose_np_normed
+ else:
+ input_pose_np = kp3d
+
+ # determine the limb connections and palettes
+ if limbs is not None:
+ limbs_target, limbs_palette = {
+ 'body': limbs.tolist() if isinstance(limbs, np.ndarray) else limbs
+ }, get_different_colors(len(limbs))
+ else:
+ limbs_target, limbs_palette = search_limbs(data_source=data_source,
+ mask=mask)
+ if palette is not None:
+ limbs_palette = np.array(palette, dtype=np.uint8)[None]
+
+ # check and pop the pop_parts
+ assert set(pop_parts).issubset(
+ keypoints_mapping.human_data.HUMAN_DATA_PALETTE.keys(
+ )), f'wrong part_names in pop_parts, could only \
+ choose in{set(keypoints_mapping.human_data.HUMAN_DATA_PALETTE.keys())}'
+
+ for part_name in pop_parts:
+ if part_name in limbs_target:
+ limbs_target.pop(part_name)
+
+ # initialize renderer and start render
+ renderer = Axes3dJointsRenderer()
+ renderer.init_camera(cam_hori_speed=orbit_speed, cam_elev_speed=0.2)
+ renderer.set_connections(limbs_target, limbs_palette)
+ if isinstance(frame_names, str):
+ if '%' in frame_names:
+ frame_names = [
+ frame_names % index for index in range(input_pose_np.shape[0])
+ ]
+ else:
+ frame_names = [frame_names] * input_pose_np.shape[0]
+ image_array = renderer.render_kp3d_to_video(input_pose_np,
+ output_path,
+ convention,
+ fps=fps,
+ resolution=resolution,
+ visual_range=value_range,
+ frame_names=frame_names,
+ disable_limbs=disable_limbs,
+ return_array=return_array)
+ return image_array
diff --git a/detrsmpl/core/visualization/visualize_smpl.py b/detrsmpl/core/visualization/visualize_smpl.py
new file mode 100644
index 0000000000000000000000000000000000000000..ed68b904c90b1482978e47d84f47746b0c489431
--- /dev/null
+++ b/detrsmpl/core/visualization/visualize_smpl.py
@@ -0,0 +1,1209 @@
+# yapf: disable
+import copy
+import glob
+import os
+import os.path as osp
+import shutil
+import warnings
+from functools import partial
+from pathlib import Path
+from typing import List, Optional, Tuple, Union
+
+import mmcv
+import numpy as np
+import torch
+import torch.nn as nn
+from colormap import Color
+
+from detrsmpl.core.cameras import (
+ WeakPerspectiveCameras,
+ compute_orbit_cameras,
+)
+from detrsmpl.core.cameras.builder import build_cameras
+from detrsmpl.core.conventions.cameras.convert_convention import \
+ convert_camera_matrix # prevent yapf isort conflict
+from detrsmpl.core.conventions.segmentation import body_segmentation
+from detrsmpl.core.renderer.torch3d_renderer import render_runner
+from detrsmpl.core.renderer.torch3d_renderer.meshes import \
+ ParametricMeshes # noqa: E501
+from detrsmpl.core.renderer.torch3d_renderer.render_smpl_config import (
+ RENDER_CONFIGS,
+)
+from detrsmpl.core.renderer.torch3d_renderer.smpl_renderer import SMPLRenderer
+from detrsmpl.core.renderer.torch3d_renderer.utils import \
+ align_input_to_padded # noqa: E501
+from detrsmpl.models.body_models.builder import build_body_model
+from detrsmpl.utils.demo_utils import (
+ convert_bbox_to_intrinsic,
+ convert_crop_cam_to_orig_img,
+ convert_kp2d_to_bbox,
+ get_default_hmr_intrinsic,
+ get_different_colors,
+)
+from detrsmpl.utils.ffmpeg_utils import (
+ check_input_path,
+ images_to_array,
+ prepare_output_path,
+ vid_info_reader,
+ video_to_array,
+ video_to_images,
+)
+from detrsmpl.utils.mesh_utils import save_meshes_as_objs, save_meshes_as_plys
+from detrsmpl.utils.path_utils import check_path_suffix
+
+# yapf: enable
+
+try:
+ from typing import Literal
+except ImportError:
+ from typing_extensions import Literal
+
+
+def _prepare_background(image_array, frame_list, origin_frames, output_path,
+ start, end, img_format, overwrite, num_frames,
+ read_frames_batch):
+ """Compare among `image_array`, `frame_list` and `origin_frames` and decide
+ whether to save the temp background images."""
+ if num_frames > 300:
+ read_frames_batch = True
+
+ frames_folder = None
+ remove_folder = False
+
+ if isinstance(image_array, np.ndarray):
+
+ image_array = torch.Tensor(image_array)
+
+ if image_array is not None:
+ if image_array.ndim == 3:
+ image_array = image_array[None]
+ if image_array.shape[0] == 1:
+ image_array = image_array.repeat(num_frames, 1, 1, 1)
+ frame_list = None
+ origin_frames = None
+ image_array = image_array[start:end]
+
+ # check the output path and get the image_array
+ if output_path is not None:
+ prepare_output_path(output_path=output_path,
+                            allowed_suffix=['.mp4', '.gif', '.png', '.jpg', '.jpeg'],
+ tag='output video',
+ path_type='auto',
+ overwrite=overwrite)
+ if image_array is None:
+ # choose in frame_list or origin_frames
+ # if all None, will use pure white background
+ if frame_list is None and origin_frames is None:
+ print(
+ 'No background provided, will use pure white background.')
+ elif frame_list is not None and origin_frames is not None:
+ warnings.warn('Redundant input, will only use frame_list.')
+ origin_frames = None
+
+ # read the origin frames as array if any.
+ if frame_list is None and origin_frames is not None:
+ check_input_path(input_path=origin_frames,
+ allowed_suffix=['.mp4', '.gif', ''],
+ tag='origin frames',
+ path_type='auto')
+ # if origin_frames is a video, write it as a folder of images
+ # if read_frames_batch is True, else read directly as an array.
+ if Path(origin_frames).is_file():
+ if read_frames_batch:
+ frames_folder = osp.join(
+ Path(output_path).parent,
+ Path(output_path).name + '_input_temp')
+ os.makedirs(frames_folder, exist_ok=True)
+ video_to_images(origin_frames,
+ frames_folder,
+ img_format=img_format,
+ start=start,
+ end=end)
+ remove_folder = True
+ else:
+ remove_folder = False
+ frames_folder = None
+ image_array = video_to_array(origin_frames,
+ start=start,
+ end=end)
+ # if origin_frames is a folder, write it as a folder of images
+ # read the folder as an array if read_frames_batch is True
+ # else return frames_folder for reading during rendering.
+ else:
+ if read_frames_batch:
+ frames_folder = origin_frames
+ remove_folder = False
+ image_array = None
+ else:
+ image_array = images_to_array(origin_frames,
+ img_format=img_format,
+ start=start,
+ end=end)
+ remove_folder = False
+ frames_folder = origin_frames
+ # if frame_list is not None, move the images into a folder
+ # read the folder as an array if read_frames_batch is True
+ # else return frames_folder for reading during rendering.
+ elif frame_list is not None and origin_frames is None:
+ frames_folder = osp.join(
+ Path(output_path).parent,
+ Path(output_path).name + '_input_temp')
+ os.makedirs(frames_folder, exist_ok=True)
+ for frame_idx, frame_path in enumerate(frame_list):
+ if check_path_suffix(
+ path_str=frame_path,
+ allowed_suffix=['.jpg', '.png', '.jpeg']):
+ shutil.copy(
+ frame_path,
+ os.path.join(frames_folder,
+ '%06d.png' % frame_idx))
+ img_format = '%06d.png'
+ if not read_frames_batch:
+
+ image_array = images_to_array(frames_folder,
+ img_format=img_format,
+ remove_raw_files=True)
+ frames_folder = None
+ remove_folder = False
+ else:
+ image_array = None
+ remove_folder = True
+ return image_array, remove_folder, frames_folder
+
+
+def _prepare_body_model(body_model, body_model_config):
+ """Prepare `body_model` from `body_model_config` or existing
+ `body_model`."""
+ if body_model is None:
+ if body_model_config is not None:
+ body_model_config = copy.deepcopy(body_model_config)
+ model_path = body_model_config.get('model_path', None)
+
+ model_type = body_model_config.get('type').lower()
+ if model_type not in ['smpl', 'smplx']:
+ raise ValueError(f'Do not support {model_type}, please choose'
+                                 f' in `smpl` or `smplx`.')
+
+ if model_path and osp.isdir(model_path):
+ model_path = osp.join(model_path, model_type)
+ body_model_config.update(model_path=model_path)
+ body_model = build_body_model(body_model_config)
+ assert os.path.isdir(model_path)
+ else:
+ raise FileNotFoundError('Wrong model_path.'
+ ' File or directory does not exist.')
+ else:
+ raise ValueError('Please input body_model_config.')
+ else:
+ if body_model_config is not None:
+            warnings.warn('Redundant input, will take body_model directly '
+                          'and ignore body_model_config.')
+ return body_model
+
+
+def _prepare_input_pose(verts, poses, betas, transl):
+ """Prepare input pose data as tensor and ensure correct temporal slice."""
+ if verts is None and poses is None:
+ raise ValueError('Please input valid poses or verts.')
+ elif (verts is not None) and (poses is not None):
+ warnings.warn('Redundant input, will take verts and ignore poses & '
+ 'betas & transl.')
+ poses = None
+ transl = None
+ betas = None
+ elif isinstance(poses, dict):
+ transl = poses.get('transl', transl)
+ betas = poses.get('betas', betas)
+
+ if isinstance(verts, np.ndarray):
+ verts = torch.Tensor(verts)
+ num_frames = verts.shape[0]
+ elif isinstance(verts, torch.Tensor):
+ num_frames = verts.shape[0]
+
+ if isinstance(poses, np.ndarray):
+ poses = torch.Tensor(poses)
+ num_frames = poses.shape[0]
+ elif isinstance(poses, torch.Tensor):
+ num_frames = poses.shape[0]
+ elif isinstance(poses, dict):
+ for k, v in poses.items():
+ if isinstance(v, np.ndarray):
+ poses[k] = torch.tensor(v)
+ num_frames = poses['body_pose'].shape[0]
+
+ if isinstance(betas, np.ndarray):
+ betas = torch.Tensor(betas)
+
+ if betas is not None:
+ if betas.shape[0] != num_frames:
+ times = num_frames // betas.shape[0]
+ if betas.ndim == 2:
+ betas = betas.repeat(times, 1)[:num_frames]
+ elif betas.ndim == 3:
+ betas = betas.repeat(times, 1, 1)[:num_frames]
+ print(f'betas will be repeated by dim 0 for {times} times.')
+ if isinstance(transl, np.ndarray):
+ transl = torch.Tensor(transl)
+
+ return verts, poses, betas, transl
+
+
+def _prepare_mesh(poses, betas, transl, verts, start, end, body_model):
+ """Prepare the mesh info for rendering."""
+ NUM_JOINTS = body_model.NUM_JOINTS
+ NUM_BODY_JOINTS = body_model.NUM_BODY_JOINTS
+ NUM_DIM = 3 * (NUM_JOINTS + 1)
+ body_pose_keys = body_model.body_pose_keys
+ joints = None
+ if poses is not None:
+ if isinstance(poses, dict):
+ if not body_pose_keys.issubset(poses):
+ raise KeyError(
+ f'{str(poses.keys())}, Please make sure that your '
+ f'input dict has all of {", ".join(body_pose_keys)}')
+ num_frames = poses['body_pose'].shape[0]
+ _, num_person, _ = poses['body_pose'].view(
+ num_frames, -1, NUM_BODY_JOINTS * 3).shape
+
+ full_pose = body_model.dict2tensor(poses)
+ full_pose = full_pose[start:end]
+
+ elif isinstance(poses, torch.Tensor):
+ if poses.shape[-1] != NUM_DIM:
+ raise ValueError(
+                    f'Please make sure your poses are {NUM_DIM} dims in '
+ f'the last axis. Your input shape: {poses.shape}')
+ poses = poses.view(poses.shape[0], -1, (NUM_JOINTS + 1) * 3)
+ num_frames, num_person, _ = poses.shape
+ full_pose = poses[start:end]
+ else:
+ raise ValueError('Wrong pose type, should be `dict` or `tensor`.')
+
+ # multi person check
+ if num_person > 1:
+ if betas is not None:
+ num_betas = betas.shape[-1]
+ betas = betas.view(num_frames, -1, num_betas)
+
+ if betas.shape[1] == 1:
+ betas = betas.repeat(1, num_person, 1)
+ warnings.warn(
+ 'Only one betas for multi-person, will all be the '
+ 'same body shape.')
+ elif betas.shape[1] > num_person:
+ betas = betas[:, :num_person]
+ warnings.warn(
+ f'Betas shape exceed, will be sliced as {betas.shape}.'
+ )
+ elif betas.shape[1] == num_person:
+ pass
+ else:
+ raise ValueError(
+                        f'Odd betas shape: {betas.shape}, inconsistent '
+ f'with poses in num_person: {poses.shape}.')
+ else:
+ warnings.warn('None betas for multi-person, will all be the '
+ 'default body shape.')
+
+ if transl is not None:
+ transl = transl.view(poses.shape[0], -1, 3)
+ if transl.shape[1] == 1:
+ transl = transl.repeat(1, num_person, 1)
+ warnings.warn(
+ 'Only one transl for multi-person, will all be the '
+ 'same translation.')
+ elif transl.shape[1] > num_person:
+ transl = transl[:, :num_person]
+                    warnings.warn(f'Transl shape exceed, will be sliced as '
+ f'{transl.shape}.')
+ elif transl.shape[1] == num_person:
+ pass
+ else:
+ raise ValueError(
+                        f'Odd transl shape: {transl.shape}, inconsistent '
+ f'with poses in num_person: {poses.shape}.')
+ else:
+ warnings.warn('None transl for multi-person, will all be the '
+ 'default translation.')
+
+ # slice the input poses, betas, and transl.
+ betas = betas[start:end] if betas is not None else None
+ transl = transl[start:end] if transl is not None else None
+ pose_dict = body_model.tensor2dict(full_pose=full_pose,
+ betas=betas,
+ transl=transl)
+
+ # get new num_frames
+ num_frames = full_pose.shape[0]
+
+ model_output = body_model(**pose_dict)
+ vertices = model_output['vertices']
+ joints = model_output['joints'][0] # hardcode here
+
+ elif verts is not None:
+ if isinstance(verts, np.ndarray):
+ verts = torch.Tensor(verts)
+ verts = verts[start:end]
+ pose_dict = body_model.tensor2dict(torch.zeros(1,
+ (NUM_JOINTS + 1) * 3))
+
+ if verts.ndim == 3:
+ joints = torch.einsum('bik,ji->bjk',
+ [verts, body_model.J_regressor])
+ elif verts.ndim == 4:
+ joints = torch.einsum('fpik,ji->fpjk',
+ [verts, body_model.J_regressor])
+ num_verts = body_model.NUM_VERTS
+ assert verts.shape[-2] == num_verts, 'Wrong input verts shape.'
+ num_frames = verts.shape[0]
+ vertices = verts.view(num_frames, -1, num_verts, 3)
+ num_joints = joints.shape[-2]
+ joints = joints.view(num_frames, -1, num_joints, 3)
+ num_person = vertices.shape[1]
+ else:
+ raise ValueError('Poses and verts are all None.')
+ return vertices, joints, num_frames, num_person
+
+
+def _prepare_colors(palette, render_choice, num_person, num_verts, model_type):
+ """Prepare the `color` as a tensor of shape (num_person, num_verts, 3)
+ according to `palette`.
+
+ This is to make the identity in video clear.
+ """
+ if not len(palette) == num_person:
+ raise ValueError('Please give the right number of palette.')
+ body_segger = body_segmentation(model_type)
+
+ if render_choice == 'silhouette':
+ colors = torch.ones(num_person, num_verts, 3)
+ elif render_choice == 'part_silhouette':
+ colors = torch.zeros(num_person, num_verts, 3)
+ for i, k in enumerate(body_segger.keys()):
+ colors[:, body_segger[k]] = i + 1
+ else:
+ if isinstance(palette, torch.Tensor):
+ if palette.max() > 1:
+ palette = palette / 255.0
+ palette = torch.clip(palette, min=0, max=1)
+ colors = palette.view(num_person,
+ 3).unsqueeze(1).repeat(1, num_verts, 1)
+
+ elif isinstance(palette, list):
+ colors = []
+ for person_idx in range(num_person):
+
+ if palette[person_idx] == 'random':
+ color_person = get_different_colors(
+ num_person, int_dtype=False)[person_idx]
+ color_person = torch.FloatTensor(color_person)
+ color_person = torch.clip(color_person * 1.5,
+ min=0.6,
+ max=1)
+ color_person = color_person.view(1, 1, 3).repeat(
+ 1, num_verts, 1)
+ elif palette[person_idx] == 'segmentation':
+ verts_labels = torch.zeros(num_verts)
+ color_person = torch.ones(1, num_verts, 3)
+ color_part = get_different_colors(len(body_segger),
+ int_dtype=False)
+ for part_idx, k in enumerate(body_segger.keys()):
+ index = body_segger[k]
+ verts_labels[index] = part_idx
+ color_person[:, index] = torch.FloatTensor(
+ color_part[part_idx])
+ elif palette[person_idx] in Color.color_names:
+ color_person = torch.FloatTensor(
+ Color(palette[person_idx]).rgb).view(1, 1, 3).repeat(
+ 1, num_verts, 1)
+ else:
+ raise ValueError('Wrong palette string. '
+ 'Please choose in the pre-defined range.')
+ colors.append(color_person)
+ colors = torch.cat(colors, 0)
+ assert colors.shape == (num_person, num_verts, 3)
+ # the color passed to renderer will be (num_person, num_verts, 3)
+ else:
+ raise ValueError(
+ 'Palette should be tensor, array or list of strs.')
+ return colors
+
+
+def render_smpl(
+ # smpl parameters
+ poses: Optional[Union[torch.Tensor, np.ndarray, dict]] = None,
+ betas: Optional[Union[torch.Tensor, np.ndarray]] = None,
+ transl: Optional[Union[torch.Tensor, np.ndarray]] = None,
+ verts: Optional[Union[torch.Tensor, np.ndarray]] = None,
+ body_model: Optional[nn.Module] = None,
+ body_model_config: Optional[dict] = None,
+ # camera parameters
+ R: Optional[Union[torch.Tensor, np.ndarray]] = None,
+ T: Optional[Union[torch.Tensor, np.ndarray]] = None,
+ K: Optional[Union[torch.Tensor, np.ndarray]] = None,
+ orig_cam: Optional[Union[torch.Tensor, np.ndarray]] = None,
+ Ks: Optional[Union[torch.Tensor, np.ndarray]] = None,
+ in_ndc: bool = True,
+ convention: str = 'pytorch3d',
+ projection: Literal['weakperspective', 'perspective', 'fovperspective',
+ 'orthographics',
+ 'fovorthographics'] = 'perspective',
+ orbit_speed: Union[float, Tuple[float, float]] = 0.0,
+ # render choice parameters
+ render_choice: Literal['lq', 'mq', 'hq', 'silhouette', 'depth',
+ 'normal', 'pointcloud',
+ 'part_silhouette'] = 'hq',
+ palette: Union[List[str], str, np.ndarray, torch.Tensor] = 'white',
+ texture_image: Union[torch.Tensor, np.ndarray] = None,
+ resolution: Optional[Union[List[int], Tuple[int, int]]] = None,
+ start: int = 0,
+ end: Optional[int] = None,
+ alpha: float = 1.0,
+ no_grad: bool = True,
+ batch_size: int = 10,
+ device: Union[torch.device, str] = 'cuda',
+ # file io parameters
+ return_tensor: bool = False,
+ output_path: str = None,
+ origin_frames: Optional[str] = None,
+ frame_list: Optional[List[str]] = None,
+ image_array: Optional[Union[np.ndarray, torch.Tensor]] = None,
+ img_format: str = '%06d.png',
+ overwrite: bool = False,
+ mesh_file_path: Optional[str] = None,
+ read_frames_batch: bool = False,
+ # visualize keypoints
+ plot_kps: bool = False,
+ kp3d: Optional[Union[np.ndarray, torch.Tensor]] = None,
+ mask: Optional[Union[np.ndarray, List[int]]] = None,
+ vis_kp_index: bool = False,
+ verbose: bool = False) -> Union[None, torch.Tensor]:
+ """Render SMPL or SMPL-X mesh or silhouette into differentiable tensors,
+ and export video or images.
+
+ Args:
+ # smpl parameters:
+ poses (Union[torch.Tensor, np.ndarray, dict]):
+
+ 1). `tensor` or `array` and ndim is 2, shape should be
+ (frame, 72).
+
+ 2). `tensor` or `array` and ndim is 3, shape should be
+ (frame, num_person, 72/165). num_person equals 1 means
+ single-person.
+            Rendering predicted multi-person poses should be fed together
+            with multi-person weakperspective cameras; the meshes will be
+            computed and an identity intrinsic matrix will be used.
+
+ 3). `dict`, standard dict format defined in smplx.body_models.
+ will be treated as single-person.
+
+ Lower priority than `verts`.
+
+ Defaults to None.
+ betas (Optional[Union[torch.Tensor, np.ndarray]], optional):
+ 1). ndim is 2, shape should be (frame, 10).
+
+ 2). ndim is 3, shape should be (frame, num_person, 10). num_person
+ equals 1 means single-person. If poses are multi-person, betas
+ should be set to the same person number.
+
+ None will use default betas.
+
+ Defaults to None.
+ transl (Optional[Union[torch.Tensor, np.ndarray]], optional):
+ translations of smpl(x).
+
+ 1). ndim is 2, shape should be (frame, 3).
+
+ 2). ndim is 3, shape should be (frame, num_person, 3). num_person
+ equals 1 means single-person. If poses are multi-person,
+ transl should be set to the same person number.
+
+ Defaults to None.
+ verts (Optional[Union[torch.Tensor, np.ndarray]], optional):
+ 1). ndim is 3, shape should be (frame, num_verts, 3).
+
+ 2). ndim is 4, shape should be (frame, num_person, num_verts, 3).
+ num_person equals 1 means single-person.
+
+ Higher priority over `poses` & `betas` & `transl`.
+
+ Defaults to None.
+ body_model (nn.Module, optional): body_model created from smplx.create.
+ Higher priority than `body_model_config`. If `body_model` is not
+ None, it will override `body_model_config`.
+ Should not both be None.
+
+ Defaults to None.
+ body_model_config (dict, optional): body_model_config for build_model.
+ Lower priority than `body_model`. Should not both be None.
+ Defaults to None.
+
+ # camera parameters:
+
+ K (Optional[Union[torch.Tensor, np.ndarray]], optional):
+ shape should be (frame, 4, 4) or (frame, 3, 3), frame could be 1.
+ if (4, 4) or (3, 3), dim 0 will be added automatically.
+ Will be default `FovPerspectiveCameras` intrinsic if None.
+ Lower priority than `orig_cam`.
+ R (Optional[Union[torch.Tensor, np.ndarray]], optional):
+ shape should be (frame, 3, 3), If f equals 1, camera will have
+ identical rotation.
+ If `K` and `orig_cam` is None, will be generated by `look_at_view`.
+ If have `K` or `orig_cam` and `R` is None, will be generated by
+ `convert_camera_matrix`.
+
+ Defaults to None.
+ T (Optional[Union[torch.Tensor, np.ndarray]], optional):
+ shape should be (frame, 3). If f equals 1, camera will have
+ identical translation.
+ If `K` and `orig_cam` is None, will be generated by `look_at_view`.
+ If have `K` or `orig_cam` and `T` is None, will be generated by
+ `convert_camera_matrix`.
+
+ Defaults to None.
+ orig_cam (Optional[Union[torch.Tensor, np.ndarray]], optional):
+ shape should be (frame, 4) or (frame, num_person, 4). If f equals
+ 1, will be repeated to num_frames. num_person should be 1 if single
+ person. Usually for HMR, VIBE predicted cameras.
+ Higher priority than `K` & `R` & `T`.
+
+ Defaults to None.
+ Ks (Optional[Union[torch.Tensor, np.ndarray]], optional):
+ shape should be (frame, 4, 4).
+ This is for HMR or SPIN multi-person demo.
+        in_ndc (bool, optional): whether the camera parameters are defined
+            in NDC (normalized device coordinates). Defaults to True.
+ convention (str, optional): If want to use an existing convention,
+ choose in ['opengl', 'opencv', 'pytorch3d', 'pyrender', 'open3d',
+ 'maya', 'blender', 'unity'].
+ If want to use a new convention, define your convention in
+ (CAMERA_CONVENTION_FACTORY)[mmhuman3d/core/conventions/cameras/
+ __init__.py] by the order of right, front and up.
+
+ Defaults to 'pytorch3d'.
+        projection (Literal, optional): projection mode of cameras. Choose in
+            ['orthographics', 'fovperspective', 'perspective',
+            'weakperspective', 'fovorthographics'].
+ Defaults to 'perspective'.
+ orbit_speed (float, optional): orbit speed for viewing when no `K`
+ provided. `float` for only azim speed and Tuple for `azim` and
+ `elev`.
+
+ # render choice parameters:
+
+        render_choice (Literal, optional):
+ choose in ['lq', 'mq', 'hq', 'silhouette', 'depth', 'normal',
+ 'pointcloud', 'part_silhouette'] .
+
+ `lq`, `mq`, `hq` would output (frame, h, w, 4) FloatTensor.
+
+ `lq` means low quality, `mq` means medium quality,
+        `hq` means high quality.
+
+ `silhouette` would output (frame, h, w) soft binary FloatTensor.
+
+ `part_silhouette` would output (frame, h, w, 1) LongTensor.
+
+ Every pixel stores a class index.
+
+ `depth` will output a depth map of (frame, h, w, 1) FloatTensor
+ and 'normal' will output a normal map of (frame, h, w, 1).
+
+ `pointcloud` will output a (frame, h, w, 4) FloatTensor.
+
+        Defaults to 'hq'.
+ palette (Union[List[str], str, np.ndarray], optional):
+ color theme str or list of color str or `array`.
+
+ 1). If use str to represent the color,
+ should choose in ['segmentation', 'random'] or color from
+ Colormap https://en.wikipedia.org/wiki/X11_color_names.
+ If choose 'segmentation', will get a color for each part.
+
+ 2). If you have multi-person, better give a list of str or all
+ will be in the same color.
+
+ 3). If you want to define your specific color, use an `array`
+ of shape (3,) for single person and (N, 3) for multiple persons.
+
+ If (3,) for multiple persons, all will be in the same color.
+
+ Your `array` should be in range [0, 255] for 8 bit color.
+
+ Defaults to 'white'.
+
+ texture_image (Union[torch.Tensor, np.ndarray], optional):
+ Texture image to be wrapped on the smpl mesh. If not None,
+ the `palette` will be ignored, and the `body_model` is required
+ to have `uv_param_path`.
+ Should pass list or tensor of shape (num_person, H, W, 3).
+ The color channel should be `RGB`.
+
+ Defaults to None.
+
+ resolution (Union[Iterable[int], int], optional):
+ 1). If iterable, should be (height, width) of output images.
+
+ 2). If int, would be taken as (resolution, resolution).
+
+            Defaults to None, which uses the background size, or
+            (1024, 1024) if there is no background.
+
+ This will influence the overlay results when render with
+ backgrounds. The output video will be rendered following the
+ size of background images and finally resized to resolution.
+ start (int, optional): start frame index. Defaults to 0.
+
+ end (int, optional): end frame index. Exclusive.
+ Could be positive int or negative int or None.
+ None represents include all the frames.
+
+ Defaults to None.
+ alpha (float, optional): Transparency of the mesh.
+ Range in [0.0, 1.0]
+
+ Defaults to 1.0.
+        no_grad (bool, optional): Set to True if you do not need a
+            differentiable render.
+
+            Defaults to True.
+ batch_size (int, optional): Batch size for render.
+ Related to your gpu memory.
+
+ Defaults to 10.
+ # file io parameters:
+
+ return_tensor (bool, optional): Whether return the result tensors.
+
+ Defaults to False, will return None.
+ output_path (str, optional): output video or gif or image folder.
+
+            Defaults to None, which skips the export procedure.
+
+ # background frames, priority: image_array > frame_list > origin_frames
+
+ origin_frames (Optional[str], optional): origin background frame path,
+ could be `.mp4`, `.gif`(will be sliced into a folder) or an image
+ folder.
+
+ Defaults to None.
+ frame_list (Optional[List[str]], optional): list of origin background
+            frame paths, each element in the list should be an image path
+            like `*.jpg` or `*.png`.
+            Use this when your file names are hard to sort or you only want
+            to render a small number of frames.
+
+ Defaults to None.
+ image_array: (Optional[Union[np.ndarray, torch.Tensor]], optional):
+ origin background frame `tensor` or `array`, use this when you
+ want your frames in memory as array or tensor.
+ overwrite (bool, optional): whether overwriting the existing files.
+
+ Defaults to False.
+        mesh_file_path (str, optional): the directory path to store the `.ply`
+            or `.obj` files. Will be named like 'frame_idx_person_idx.ply'.
+
+ Defaults to None.
+ read_frames_batch (bool, optional): Whether read frames by batch.
+ Set it as True if your video is large in size.
+
+ Defaults to False.
+
+ # visualize keypoints
+ plot_kps (bool, optional): whether plot keypoints on the output video.
+
+ Defaults to False.
+ kp3d (Optional[Union[np.ndarray, torch.Tensor]], optional):
+            the keypoints of any convention, should pass `mask` if there are
+            any invalid points. Shape should be (frame, J, 3).
+
+ Defaults to None.
+ mask (Optional[Union[np.ndarray, List[int]]], optional):
+ Mask of keypoints existence.
+
+ Defaults to None.
+ vis_kp_index (bool, optional):
+ Whether plot keypoint index number on human mesh.
+
+ Defaults to False.
+ # visualize render progress
+ verbose (bool, optional):
+            Whether to print the progress bar for rendering.
+            Defaults to False.
+ Returns:
+ Union[None, torch.Tensor]: return the rendered image tensors or None.
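+
+    Example:
+        A minimal sketch rendering a single-person SMPL sequence on a pure
+        white background (`model_path` and the output path are placeholders
+        that must point to your own files):
+
+        >>> import numpy as np
+        >>> poses = np.zeros((30, 72))  # 30 frames of SMPL pose parameters
+        >>> render_smpl(
+        ...     poses=poses,
+        ...     body_model_config=dict(type='smpl',
+        ...                            model_path='path/to/body_models'),
+        ...     render_choice='hq',
+        ...     resolution=1024,
+        ...     output_path='demo_out/smpl_render.mp4')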
+ """
+ # initialize the device
+ device = torch.device(device) if isinstance(device, str) else device
+
+ if isinstance(resolution, int):
+ resolution = (resolution, resolution)
+ elif isinstance(resolution, list):
+ resolution = tuple(resolution)
+
+ verts, poses, betas, transl = _prepare_input_pose(verts, poses, betas,
+ transl)
+
+ body_model = _prepare_body_model(body_model, body_model_config)
+ model_type = body_model.name().replace('-', '').lower()
+ assert model_type in ['smpl', 'smplx']
+
+ vertices, joints, num_frames, num_person = _prepare_mesh(
+ poses, betas, transl, verts, start, end, body_model)
+ end = num_frames if end is None else end
+ vertices = vertices.view(num_frames, num_person, -1, 3)
+ num_verts = vertices.shape[-2]
+
+ if not plot_kps:
+ joints = None
+ if kp3d is not None:
+ warnings.warn('`plot_kps` is False, `kp3d` will be set as None.')
+ kp3d = None
+
+ image_array, remove_folder, frames_folder = _prepare_background(
+ image_array, frame_list, origin_frames, output_path, start, end,
+ img_format, overwrite, num_frames, read_frames_batch)
+
+ render_resolution = None
+ if image_array is not None:
+ render_resolution = (image_array.shape[1], image_array.shape[2])
+ elif frames_folder is not None:
+ frame_path_list = glob.glob(osp.join(
+ frames_folder, '*.jpg')) + glob.glob(
+ osp.join(frames_folder, '*.png')) + glob.glob(
+ osp.join(frames_folder, '*.jpeg'))
+ vid_info = vid_info_reader(frame_path_list[0])
+ render_resolution = (int(vid_info['height']), int(vid_info['width']))
+ if resolution is not None:
+ if render_resolution is not None:
+ if render_resolution != resolution:
+ warnings.warn(
+ f'Size of background: {render_resolution} !='
+ f' resolution: {resolution}, the output video will be '
+ f'resized as {resolution}')
+ final_resolution = resolution
+ elif render_resolution is None:
+ render_resolution = final_resolution = resolution
+ elif resolution is None:
+ if render_resolution is None:
+ render_resolution = final_resolution = (1024, 1024)
+ elif render_resolution is not None:
+ final_resolution = render_resolution
+
+ if isinstance(kp3d, np.ndarray):
+ kp3d = torch.Tensor(kp3d)
+
+ if kp3d is not None:
+ if mask is not None:
+ map_index = np.where(np.array(mask) != 0)[0]
+ kp3d = kp3d[map_index.tolist()]
+ kp3d = kp3d[start:end]
+ kp3d = kp3d.view(num_frames, -1, 3)
+
+ # prepare render_param_dict
+ render_param_dict = copy.deepcopy(RENDER_CONFIGS[render_choice.lower()])
+ if model_type == 'smpl':
+ render_param_dict.update(num_class=24)
+ elif model_type == 'smplx':
+ render_param_dict.update(num_class=27)
+
+ if render_choice not in [
+ 'hq', 'mq', 'lq', 'silhouette', 'part_silhouette', 'depth',
+ 'pointcloud', 'normal'
+ ]:
+ raise ValueError('Please choose the right render_choice.')
+
+ # body part colorful visualization should use flat shader to be sharper.
+ if texture_image is None:
+ if isinstance(palette, str):
+ palette = [palette] * num_person
+ elif isinstance(palette, np.ndarray):
+ palette = torch.Tensor(palette)
+ palette = palette.view(-1, 3)
+ if palette.shape[0] != num_person:
+ _times = num_person // palette.shape[0]
+ palette = palette.repeat(_times, 1)[:num_person]
+ if palette.shape[0] == 1:
+ print(f'Same color for all the {num_person} people')
+ else:
+ print('Repeat palette for multi-person.')
+ else:
+ raise ValueError('Wrong input palette type. '
+ 'Palette should be tensor, array or list of strs')
+ colors_all = _prepare_colors(palette, render_choice, num_person,
+ num_verts, model_type)
+ colors_all = colors_all.view(-1, num_person * num_verts, 3)
+ # verts of ParametricMeshes should be in (N, V, 3)
+ vertices = vertices.view(num_frames, -1, 3)
+ meshes = ParametricMeshes(
+ body_model=body_model,
+ verts=vertices,
+ N_individual_overdide=num_person,
+ model_type=model_type,
+ texture_image=texture_image,
+ use_nearest=bool(render_choice == 'part_silhouette'),
+ vertex_color=colors_all)
+
+ # write .ply or .obj files
+ if mesh_file_path is not None:
+ mmcv.mkdir_or_exist(mesh_file_path)
+
+ for person_idx in range(meshes.shape[1]):
+ mesh_person = meshes[:, person_idx]
+ if texture_image is None:
+ ply_paths = [
+ f'{mesh_file_path}/frame{frame_idx}_'
+ f'person{person_idx}.ply'
+ for frame_idx in range(num_frames)
+ ]
+ save_meshes_as_plys(meshes=mesh_person, files=ply_paths)
+
+ else:
+ obj_paths = [
+ f'{mesh_file_path}/frame{frame_idx}_'
+ f'person{person_idx}.obj'
+ for frame_idx in range(num_frames)
+ ]
+ save_meshes_as_objs(meshes=mesh_person, files=obj_paths)
+
+ vertices = meshes.verts_padded().view(num_frames, num_person, -1, 3)
+
+ # prepare camera matrices
+ if Ks is not None:
+ projection = 'perspective'
+ orig_cam = None
+ if isinstance(Ks, np.ndarray):
+ Ks = torch.Tensor(Ks)
+ Ks = Ks.view(-1, num_person, 3, 3)
+ Ks = Ks[start:end]
+ Ks = Ks.view(-1, 3, 3)
+ K = K.repeat(num_frames * num_person, 1, 1)
+
+ Ks = K.inverse() @ Ks @ K
+ vertices = vertices.view(num_frames * num_person, -1, 3)
+ if T is None:
+ T = torch.zeros(num_frames, num_person, 1, 3)
+ elif isinstance(T, np.ndarray):
+ T = torch.Tensor(T)
+ T = T[start:end]
+ T = T.view(num_frames * num_person, 1, 3)
+ vertices = torch.einsum('blc,bvc->bvl', Ks, vertices + T)
+
+ R = None
+ T = None
+ vertices = vertices.view(num_frames, num_person, -1, 3)
+
+ if orig_cam is not None:
+ if isinstance(orig_cam, np.ndarray):
+ orig_cam = torch.Tensor(orig_cam)
+ projection = 'weakperspective'
+ r = render_resolution[1] / render_resolution[0]
+ orig_cam = orig_cam[start:end]
+ orig_cam = orig_cam.view(num_frames, num_person, 4)
+ # if num_person > 1:
+ sx, sy, tx, ty = torch.unbind(orig_cam, -1)
+
+ vertices[..., 0] += tx.view(num_frames, num_person, 1)
+ vertices[..., 1] += ty.view(num_frames, num_person, 1)
+ vertices[..., 0] *= sx.view(num_frames, num_person, 1)
+ vertices[..., 1] *= sy.view(num_frames, num_person, 1)
+ orig_cam = torch.tensor([1.0, 1.0, 0.0,
+ 0.0]).view(1, 4).repeat(num_frames, 1)
+ K, R, T = WeakPerspectiveCameras.convert_orig_cam_to_matrix(
+ orig_cam=orig_cam,
+ znear=torch.min(vertices[..., 2] - 1),
+ aspect_ratio=r)
+
+ if num_person > 1:
+ vertices = vertices.reshape(num_frames, -1, 3)
+ else:
+ vertices = vertices.view(num_frames, -1, 3)
+ meshes = meshes.update_padded(new_verts_padded=vertices)
+
+ # orig_cam and K are None, use look_at_view
+ if K is None:
+ projection = 'fovperspective'
+ K, R, T = compute_orbit_cameras(at=(torch.mean(vertices.view(-1, 3),
+ 0)).detach().cpu(),
+ orbit_speed=orbit_speed,
+ batch_size=num_frames,
+ convention=convention)
+ convention = 'pytorch3d'
+
+ if isinstance(R, np.ndarray):
+ R = torch.Tensor(R).view(-1, 3, 3)
+ elif isinstance(R, torch.Tensor):
+ R = R.view(-1, 3, 3)
+ elif isinstance(R, list):
+ R = torch.Tensor(R).view(-1, 3, 3)
+ elif R is None:
+ pass
+ else:
+ raise ValueError(f'Wrong type of R: {type(R)}!')
+
+ if R is not None:
+ if len(R) > num_frames:
+ R = R[start:end]
+
+ if isinstance(T, np.ndarray):
+ T = torch.Tensor(T).view(-1, 3)
+ elif isinstance(T, torch.Tensor):
+ T = T.view(-1, 3)
+ elif isinstance(T, list):
+ T = torch.Tensor(T).view(-1, 3)
+ elif T is None:
+ pass
+ else:
+ raise ValueError(f'Wrong type of T: {type(T)}!')
+
+ if T is not None:
+ if len(T) > num_frames:
+ T = T[start:end]
+
+ if isinstance(K, np.ndarray):
+ K = torch.Tensor(K).view(-1, K.shape[-2], K.shape[-1])
+ elif isinstance(K, torch.Tensor):
+ K = K.view(-1, K.shape[-2], K.shape[-1])
+ elif isinstance(K, list):
+ K = torch.Tensor(K)
+ K = K.view(-1, K.shape[-2], K.shape[-1])
+ else:
+ raise ValueError(f'Wrong type of K: {type(K)}!')
+
+ if K is not None:
+ if len(K) > num_frames:
+ K = K[start:end]
+
+ assert projection in [
+ 'perspective', 'weakperspective', 'orthographics', 'fovorthographics',
+ 'fovperspective'
+ ], f'Wrong camera projection: {projection}'
+ if projection in ['fovperspective', 'perspective']:
+ is_perspective = True
+ elif projection in [
+ 'fovorthographics', 'weakperspective', 'orthographics'
+ ]:
+ is_perspective = False
+ if projection in ['fovperspective', 'fovorthographics', 'weakperspective']:
+ assert in_ndc
+
+ K, R, T = convert_camera_matrix(convention_dst='pytorch3d',
+ K=K,
+ R=R,
+ T=T,
+ is_perspective=is_perspective,
+ convention_src=convention,
+ resolution_src=render_resolution,
+ in_ndc_src=in_ndc,
+ in_ndc_dst=in_ndc)
+
+ # initialize the renderer.
+ renderer = SMPLRenderer(resolution=render_resolution,
+ device=device,
+ output_path=output_path,
+ return_tensor=return_tensor,
+ alpha=alpha,
+ read_img_format=img_format,
+ render_choice=render_choice,
+ frames_folder=frames_folder,
+ plot_kps=plot_kps,
+ vis_kp_index=vis_kp_index,
+ final_resolution=final_resolution,
+ **render_param_dict)
+
+ cameras = build_cameras(
+ dict(type=projection,
+ in_ndc=in_ndc,
+ device=device,
+ K=K,
+ R=R,
+ T=T,
+ resolution=render_resolution))
+
+ if image_array is not None:
+ image_array = torch.Tensor(image_array)
+ image_array = align_input_to_padded(image_array,
+ ndim=4,
+ batch_size=num_frames,
+ padding_mode='ones')
+ # prepare the render data.
+ render_data = dict(
+ images=image_array,
+ meshes=meshes,
+ cameras=cameras,
+ joints=joints,
+ joints_gt=kp3d,
+ )
+
+ results = render_runner.render(renderer=renderer,
+ device=device,
+ batch_size=batch_size,
+ output_path=output_path,
+ return_tensor=return_tensor,
+ no_grad=no_grad,
+ verbose=verbose,
+ **render_data)
+
+ if remove_folder:
+ if Path(frames_folder).is_dir():
+ shutil.rmtree(frames_folder)
+
+ if return_tensor:
+ return results
+ else:
+ return None
+
+
+def visualize_smpl_calibration(
+ K,
+ R,
+ T,
+ resolution,
+ **kwargs,
+) -> None:
+ """Visualize a smpl mesh which has opencv calibration matrix defined in
+ screen."""
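+ # Hypothetical sketch: `K_cv`, `R_cv` and `T_cv` come from your own OpenCV
+ # calibration; the remaining kwargs are forwarded to render_smpl.
+ #   visualize_smpl_calibration(
+ #       K=K_cv, R=R_cv, T=T_cv, resolution=(1080, 1920),
+ #       poses=poses, betas=betas, transl=transl,
+ #       body_model_config=dict(type='smpl', model_path='body_models/smpl'),
+ #       origin_frames='input_frames/', output_path='calib_overlay.mp4')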
+ assert K is not None, '`K` is required.'
+ assert resolution is not None, '`resolution` (h, w) is required.'
+ func = partial(render_smpl,
+ projection='perspective',
+ convention='opencv',
+ orig_cam=None,
+ in_ndc=False)
+ for k in func.keywords.keys():
+ if k in kwargs:
+ kwargs.pop(k)
+ return func(K=K, R=R, T=T, resolution=resolution, **kwargs)
+
+
+def visualize_smpl_hmr(cam_transl,
+ bbox=None,
+ kp2d=None,
+ focal_length=5000,
+ det_width=224,
+ det_height=224,
+ bbox_format='xyxy',
+ **kwargs) -> None:
+ """Simplest way to visualize HMR or SPIN or Smplify pred smpl with origin
+ frames and predicted cameras."""
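+ # Hypothetical sketch: `pred_cam` is the (N, 3) camera predicted by an
+ # HMR-style model and `bboxes` the matching detection boxes in xyxy format.
+ #   visualize_smpl_hmr(
+ #       cam_transl=pred_cam, bbox=bboxes, focal_length=5000,
+ #       poses=poses, betas=betas,
+ #       body_model_config=dict(type='smpl', model_path='body_models/smpl'),
+ #       origin_frames='input_frames/', output_path='hmr_overlay.mp4')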
+ if kp2d is not None:
+ bbox = convert_kp2d_to_bbox(kp2d, bbox_format=bbox_format)
+ Ks = convert_bbox_to_intrinsic(bbox, bbox_format=bbox_format)
+ K = torch.Tensor(
+ get_default_hmr_intrinsic(focal_length=focal_length,
+ det_height=det_height,
+ det_width=det_width))
+ func = partial(
+ render_smpl,
+ projection='perspective',
+ convention='opencv',
+ in_ndc=False,
+ K=None,
+ R=None,
+ orig_cam=None,
+ )
+ if isinstance(cam_transl, np.ndarray):
+ cam_transl = torch.Tensor(cam_transl)
+ T = torch.cat([
+ cam_transl[..., [1]], cam_transl[..., [2]], 2 * focal_length /
+ (det_width * cam_transl[..., [0]] + 1e-9)
+ ], -1)
+ for k in func.keywords.keys():
+ if k in kwargs:
+ kwargs.pop(k)
+ return func(Ks=Ks, K=K, T=T, **kwargs)
+
+
+def visualize_smpl_vibe(orig_cam=None,
+ pred_cam=None,
+ bbox=None,
+ output_path='sample.mp4',
+ resolution=None,
+ aspect_ratio=1.0,
+ bbox_scale_factor=1.25,
+ bbox_format='xyxy',
+ **kwargs) -> None:
+ """Simplest way to visualize pred smpl with origin frames and predicted
+ cameras."""
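+ # Hypothetical sketch: `pred_cam` and `bboxes` are VIBE-style outputs and
+ # `resolution` is the (height, width) of the original frames.
+ #   visualize_smpl_vibe(
+ #       pred_cam=pred_cam, bbox=bboxes, resolution=(720, 1280),
+ #       poses=poses, betas=betas,
+ #       body_model_config=dict(type='smpl', model_path='body_models/smpl'),
+ #       origin_frames='input_frames/', output_path='vibe_overlay.mp4')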
+ assert resolution is not None
+ if pred_cam is not None and bbox is not None:
+ orig_cam = torch.Tensor(
+ convert_crop_cam_to_orig_img(pred_cam, bbox, resolution[1],
+ resolution[0], aspect_ratio,
+ bbox_scale_factor, bbox_format))
+ assert orig_cam is not None, '`orig_cam` is required.'
+
+ func = partial(
+ render_smpl,
+ projection='weakperspective',
+ convention='opencv',
+ in_ndc=True,
+ )
+ for k in func.keywords.keys():
+ if k in kwargs:
+ kwargs.pop(k)
+ return func(orig_cam=orig_cam,
+ output_path=output_path,
+ resolution=resolution,
+ **kwargs)
+
+
+def visualize_T_pose(num_frames,
+ body_model_config=None,
+ body_model=None,
+ orbit_speed=1.0,
+ **kwargs) -> None:
+ """Simplest way to visualize a sequence of T pose."""
+ assert num_frames > 0, '`num_frames` is required.'
+ assert body_model_config is not None or body_model is not None
+ model_type = body_model_config[
+ 'type'] if body_model_config is not None else body_model.name(
+ ).replace('-', '').lower()
+ if model_type == 'smpl':
+ poses = torch.zeros(num_frames, 72)
+ else:
+ poses = torch.zeros(num_frames, 165)
+
+ func = partial(render_smpl,
+ betas=None,
+ transl=None,
+ verts=None,
+ convention='pytorch3d',
+ projection='fovperspective',
+ K=None,
+ R=None,
+ T=None,
+ origin_frames=None)
+ for k in func.keywords.keys():
+ if k in kwargs:
+ kwargs.pop(k)
+ return func(poses=poses,
+ body_model_config=body_model_config,
+ body_model=body_model,
+ orbit_speed=orbit_speed,
+ **kwargs)
+
+
+def visualize_smpl_pose(poses=None, verts=None, **kwargs) -> None:
+ """Simplest way to visualize a sequence of smpl pose.
+
+ Cameras will focus on the center of the smpl mesh. Setting
+ `orbit_speed` is recommended.
+ """
+ assert (poses
+ is not None) or (verts
+ is not None), 'Pass either `poses` or `verts`.'
+ func = partial(render_smpl,
+ convention='opencv',
+ projection='fovperspective',
+ K=None,
+ R=None,
+ T=None,
+ in_ndc=True,
+ origin_frames=None,
+ frame_list=None,
+ image_array=None)
+ for k in func.keywords.keys():
+ if k in kwargs:
+ kwargs.pop(k)
+ return func(poses=poses, verts=verts, **kwargs)
diff --git a/detrsmpl/data/__init__.py b/detrsmpl/data/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/data/data_structures/__init__.py b/detrsmpl/data/data_structures/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/data/data_structures/human_data.py b/detrsmpl/data/data_structures/human_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..c32def905edd8474d1dd89b3ec1d0515562a8c89
--- /dev/null
+++ b/detrsmpl/data/data_structures/human_data.py
@@ -0,0 +1,1413 @@
+import logging
+import pickle
+from enum import Enum
+from math import ceil
+from typing import Any, List, Optional, TypeVar, Union, overload
+
+import numpy as np
+import torch
+from mmcv.utils import print_log
+
+from detrsmpl.utils.path_utils import (
+ Existence,
+ check_path_existence,
+ check_path_suffix,
+)
+
+# In T = TypeVar('T'), T can be anything.
+# See definition of typing.TypeVar for details.
+_T1 = TypeVar('_T1')
+_KT = TypeVar('_KT')
+_VT = TypeVar('_VT')
+_HumanData = TypeVar('_HumanData')
+_CPU_DEVICE = torch.device('cpu')
+
+_HumanData_SUPPORTED_KEYS = {
+ 'image_path': {
+ 'type': list,
+ },
+ 'image_id': {
+ 'type': list,
+ },
+ 'bbox_xywh': {
+ 'type': np.ndarray,
+ 'shape': (-1, 5),
+ 'dim': 0
+ },
+ 'config': {
+ 'type': str,
+ 'dim': None
+ },
+ 'keypoints2d': {
+ 'type': np.ndarray,
+ 'shape': (-1, -1, 3),
+ 'dim': 0
+ },
+ 'keypoints3d': {
+ 'type': np.ndarray,
+ 'shape': (-1, -1, 4),
+ 'dim': 0
+ },
+ 'smpl': {
+ 'type': dict,
+ 'slice_key': 'betas',
+ 'dim': 0
+ },
+ 'smplh': {
+ 'type': dict,
+ 'slice_key': 'betas',
+ 'dim': 0
+ },
+ 'smplx': {
+ 'type': dict,
+ 'slice_key': 'betas',
+ 'dim': 0
+ },
+ 'meta': {
+ 'type': dict,
+ },
+ 'keypoints2d_mask': {
+ 'type': np.ndarray,
+ 'shape': (-1, ),
+ 'dim': None
+ },
+ 'keypoints2d_convention': {
+ 'type': str,
+ 'dim': None
+ },
+ 'keypoints3d_mask': {
+ 'type': np.ndarray,
+ 'shape': (-1, ),
+ 'dim': None
+ },
+ 'keypoints3d_convention': {
+ 'type': str,
+ 'dim': None
+ },
+ 'vertices': {
+ 'type': np.ndarray,
+ 'shape': (-1, ),
+ 'dim': None
+ },
+ 'focal_length': {
+ 'type': np.ndarray,
+ 'shape': (-1, ),
+ 'dim': 0
+ },
+ 'principal_point': {
+ 'type': np.ndarray,
+ 'shape': (-1, ),
+ 'dim': 0
+ },
+ 'misc': {
+ 'type': dict,
+ },
+}
+
+
+class _KeyCheck(Enum):
+ PASS = 0
+ WARN = 1
+ ERROR = 2
+
+
+class HumanData(dict):
+ logger = None
+ SUPPORTED_KEYS = _HumanData_SUPPORTED_KEYS
+ WARNED_KEYS = []
+
+ def __new__(cls: _HumanData, *args: Any, **kwargs: Any) -> _HumanData:
+ """New an instance of HumanData.
+
+ Args:
+ cls (HumanData): HumanData class.
+
+ Returns:
+ HumanData: An instance of HumanData.
+ """
+ ret_human_data = super().__new__(cls, args, kwargs)
+ setattr(ret_human_data, '__data_len__', -1)
+ setattr(ret_human_data, '__key_strict__', False)
+ setattr(ret_human_data, '__keypoints_compressed__', False)
+ return ret_human_data
+
+ @classmethod
+ def set_logger(cls, logger: Union[logging.Logger, str, None] = None):
+ """Set logger of HumanData class.
+
+ Args:
+ logger (logging.Logger | str | None, optional):
+ The way to print summary.
+ See `mmcv.utils.print_log()` for details.
+ Defaults to None.
+ """
+ cls.logger = logger
+
+ @classmethod
+ def fromfile(cls, npz_path: str) -> _HumanData:
+ """Construct a HumanData instance from an npz file.
+
+ Args:
+ npz_path (str):
+ Path to a dumped npz file.
+
+ Returns:
+ HumanData:
+ A HumanData instance loaded from file.
+ """
+ ret_human_data = cls()
+ ret_human_data.load(npz_path)
+ return ret_human_data
+
+ @classmethod
+ def new(cls,
+ source_dict: dict = None,
+ key_strict: bool = False) -> _HumanData:
+ """Construct a HumanData instance from a dict.
+
+ Args:
+ source_dict (dict, optional):
+ A dict with items in HumanData fashion.
+ Defaults to None.
+ key_strict (bool, optional):
+ Whether to raise error when setting unsupported keys.
+ Defaults to False.
+
+ Returns:
+ HumanData:
+ A HumanData instance.
+ """
+ if source_dict is None:
+ ret_human_data = cls()
+ else:
+ ret_human_data = cls(source_dict)
+ ret_human_data.set_key_strict(key_strict)
+ return ret_human_data
+
+ def get_key_strict(self) -> bool:
+ """Get value of attribute key_strict.
+
+ Returns:
+ bool:
+ Whether to raise error when setting unsupported keys.
+ """
+ return self.__key_strict__
+
+ def set_key_strict(self, value: bool):
+ """Set value of attribute key_strict.
+
+ Args:
+ value (bool, optional):
+ Whether to raise error when setting unsupported keys.
+ Defaults to True.
+ """
+ former__key_strict__ = self.__key_strict__
+ self.__key_strict__ = value
+ if former__key_strict__ is False and \
+ value is True:
+ self.pop_unsupported_items()
+
+ def check_keypoints_compressed(self) -> bool:
+ """Check whether the keypoints are compressed.
+
+ Returns:
+ bool:
+ Whether the keypoints are compressed.
+ """
+ return self.__keypoints_compressed__
+
+ def load(self, npz_path: str):
+ """Load data from npz_path and update them to self.
+
+ Args:
+ npz_path (str):
+ Path to a dumped npz file.
+ """
+ supported_keys = self.__class__.SUPPORTED_KEYS
+ with np.load(npz_path, allow_pickle=True) as npz_file:
+ tmp_data_dict = dict(npz_file)
+ for key, value in list(tmp_data_dict.items()):
+ if isinstance(value, np.ndarray) and\
+ len(value.shape) == 0:
+ # value is not an ndarray before dump
+ value = value.item()
+ elif key in supported_keys and\
+ type(value) != supported_keys[key]['type']:
+ value = supported_keys[key]['type'](value)
+ if value is None:
+ tmp_data_dict.pop(key)
+ elif key == '__key_strict__' or \
+ key == '__data_len__' or\
+ key == '__keypoints_compressed__':
+ self.__setattr__(key, value)
+ # pop the attributes to keep dict clean
+ tmp_data_dict.pop(key)
+ elif key == 'bbox_xywh' and value.shape[1] == 4:
+ value = np.hstack([value, np.ones([value.shape[0], 1])])
+ tmp_data_dict[key] = value
+ else:
+ tmp_data_dict[key] = value
+ self.update(tmp_data_dict)
+ self.__set_default_values__()
+
+ def dump(self, npz_path: str, overwrite: bool = True):
+ """Dump keys and items to an npz file.
+
+ Args:
+ npz_path (str):
+ Path to a dumped npz file.
+ overwrite (bool, optional):
+ Whether to overwrite if there is already a file.
+ Defaults to True.
+
+ Raises:
+ ValueError:
+ npz_path does not end with '.npz'.
+ FileExistsError:
+ When overwrite is False and file exists.
+ """
+ if not check_path_suffix(npz_path, ['.npz']):
+ raise ValueError('Not an npz file.')
+ if not overwrite:
+ if check_path_existence(npz_path, 'file') == Existence.FileExist:
+ raise FileExistsError
+ dict_to_dump = {
+ '__key_strict__': self.__key_strict__,
+ '__data_len__': self.__data_len__,
+ '__keypoints_compressed__': self.__keypoints_compressed__,
+ }
+ dict_to_dump.update(self)
+ np.savez_compressed(npz_path, **dict_to_dump)
+
+ def get_sliced_cache(self, slice_size=10) -> List:
+ """Slice the whole HumanData into pieces for HumanDataCacheWriter.
+
+ Args:
+ slice_size (int, optional):
+ The length of each unit in HumanData cache.
+ Defaults to 10.
+
+ Returns:
+ List:
+ Two dicts for HumanDataCacheWriter.
+ Init HumanDataCacheWriter by HumanDataCacheWriter(**Returns[0])
+ and set data by
+ human_data_cache_writer.update_sliced_dict(Returns[1]).
+ """
+ keypoints_info = {}
+ non_sliced_data = {}
+ sliced_data = {}
+ slice_num = ceil(self.__data_len__ / slice_size)
+ for slice_index in range(slice_num):
+ sliced_data[str(slice_index)] = {}
+ dim_dict = self.__get_slice_dim__()
+ for key, dim in dim_dict.items():
+ # no dim to slice
+ if dim is None:
+ if key.startswith('keypoints') and\
+ (key.endswith('_mask') or
+ key.endswith('_convention')):
+ keypoints_info[key] = self[key]
+ else:
+ non_sliced_data[key] = self[key]
+ elif isinstance(dim, dict):
+ value_dict = self.get_raw_value(key)
+ non_sliced_sub_dict = {}
+ for sub_key in value_dict.keys():
+ sub_value = value_dict[sub_key]
+ if dim[sub_key] is None:
+ non_sliced_sub_dict[sub_key] = sub_value
+ else:
+ sub_dim = dim[sub_key]
+ for slice_index in range(slice_num):
+ slice_start = slice_index * slice_size
+ slice_end = min((slice_index + 1) * slice_size,
+ self.__data_len__)
+ slice_range = slice(slice_start, slice_end)
+ sliced_sub_value = \
+ HumanData.__get_sliced_result__(
+ sub_value, sub_dim, slice_range
+ )
+ if key not in sliced_data[str(slice_index)]:
+ sliced_data[str(slice_index)][key] = {}
+ sliced_data[str(slice_index)][key][sub_key] = \
+ sliced_sub_value
+ if len(non_sliced_sub_dict) > 0:
+ non_sliced_data[key] = non_sliced_sub_dict
+ else:
+ value = self.get_raw_value(key)
+ # slice as ndarray
+ if isinstance(value, np.ndarray):
+ slice_list = [
+ slice(None),
+ ] * len(value.shape)
+ for slice_index in range(slice_num):
+ slice_start = slice_index * slice_size
+ slice_end = min((slice_index + 1) * slice_size,
+ self.__data_len__)
+ slice_list[dim] = slice(slice_start, slice_end)
+ sliced_value = value[tuple(slice_list)]
+ sliced_data[str(slice_index)][key] = sliced_value
+ # slice as list/tuple
+ else:
+ for slice_index in range(slice_num):
+ slice_start = slice_index * slice_size
+ slice_end = min((slice_index + 1) * slice_size,
+ self.__data_len__)
+ sliced_value = value[slice(slice_start, slice_end)]
+ sliced_data[str(slice_index)][key] = sliced_value
+ writer_args_dict = {
+ 'slice_size': slice_size,
+ 'keypoints_info': keypoints_info,
+ 'data_len': self.data_len,
+ 'non_sliced_data': non_sliced_data,
+ 'key_strict': self.get_key_strict()
+ }
+ return writer_args_dict, sliced_data
+
+ def to(self,
+ device: Optional[Union[torch.device, str]] = _CPU_DEVICE,
+ dtype: Optional[torch.dtype] = None,
+ non_blocking: Optional[bool] = False,
+ copy: Optional[bool] = False,
+ memory_format: Optional[torch.memory_format] = None) -> dict:
+ """Convert values in numpy.ndarray type to torch.Tensor, and move
+ Tensors to the target device. All keys will exist in the returned dict.
+
+ Args:
+ device (Union[torch.device, str], optional):
+ A specified device. Defaults to CPU_DEVICE.
+ dtype (torch.dtype, optional):
+ The data type of the expected torch.Tensor.
+ If dtype is None, it is decided according to the numpy.ndarray.
+ Defaults to None.
+ non_blocking (bool, optional):
+ When non_blocking, tries to convert asynchronously with
+ respect to the host if possible, e.g.,
+ converting a CPU Tensor with pinned memory to a CUDA Tensor.
+ Defaults to False.
+ copy (bool, optional):
+ When copy is set, a new Tensor is created even when
+ the Tensor already matches the desired conversion.
+ No matter what value copy is, Tensor constructed from numpy
+ will not share the same memory with the source numpy.ndarray.
+ Defaults to False.
+ memory_format (torch.memory_format, optional):
+ The desired memory format of returned Tensor.
+ Not supported by pytorch-cpu.
+ Defaults to None.
+
+ Returns:
+ dict:
+ A dict with all numpy.ndarray values converted into
+ torch.Tensor and all Tensors moved to the target device.
+ """
+ ret_dict = {}
+ for key in self.keys():
+ raw_value = self.get_raw_value(key)
+ tensor_value = None
+ if isinstance(raw_value, np.ndarray):
+ tensor_value = torch.from_numpy(raw_value).clone()
+ elif isinstance(raw_value, torch.Tensor):
+ tensor_value = raw_value
+ if tensor_value is None:
+ ret_dict[key] = raw_value
+ else:
+ if memory_format is None:
+ ret_dict[key] = \
+ tensor_value.to(device, dtype,
+ non_blocking, copy)
+ else:
+ ret_dict[key] = \
+ tensor_value.to(device, dtype,
+ non_blocking, copy,
+ memory_format=memory_format)
+ return ret_dict
+
+ def __getitem__(self, key: _KT) -> _VT:
+ """Get value defined by HumanData. This function will be called by
+ self[key]. In keypoints_compressed mode, if the key contains
+ 'keypoints', an array with zero-padding at absent keypoints will be
+ returned. Call self.get_raw_value(k) to get the value without padding.
+
+ Args:
+ key (_KT):
+ Key in HumanData.
+
+ Returns:
+ _VT:
+ Value to the key.
+ """
+ value = super().__getitem__(key)
+ if self.__keypoints_compressed__:
+ mask_key = f'{key}_mask'
+ if key in self and \
+ isinstance(value, np.ndarray) and \
+ 'keypoints' in key and \
+ mask_key in self:
+ mask_array = np.asarray(super().__getitem__(mask_key))
+ value = \
+ self.__class__.__add_zero_pad__(value, mask_array)
+ return value
+
+ def get_raw_value(self, key: _KT) -> _VT:
+ """Get raw value from the dict. It acts the same as
+ dict.__getitem__(k).
+
+ Args:
+ key (_KT):
+ Key in dict.
+
+ Returns:
+ _VT:
+ Value to the key.
+ """
+ value = super().__getitem__(key)
+ return value
+
+ def get_value_in_shape(self,
+ key: _KT,
+ shape: Union[list, tuple],
+ padding_constant: int = 0) -> np.ndarray:
+ """Get value in a specific shape. For each dim, if the required shape
+ is smaller than current shape, ndarray will be sliced. Otherwise, it
+ will be padded with padding_constant at the end.
+
+ Args:
+ key (_KT):
+ Key in dict. The value of this key must be
+ an instance of numpy.ndarray.
+ shape (Union[list, tuple]):
+ Shape of the returned array. Its length
+ must be equal to value.ndim. Set -1 for
+ a dimension if you do not want to edit it.
+ padding_constant (int, optional):
+ The value to set the padded values for each axis.
+ Defaults to 0.
+
+ Raises:
+ ValueError:
+ A value in shape is neither positive integer nor -1.
+
+ Returns:
+ np.ndarray:
+ An array in required shape.
+ """
+ value = self.get_raw_value(key)
+ assert isinstance(value, np.ndarray)
+ assert value.ndim == len(shape)
+ pad_width_list = []
+ slice_list = []
+ for dim_index in range(len(shape)):
+ if shape[dim_index] == -1:
+ # no pad or slice
+ pad_width_list.append((0, 0))
+ slice_list.append(slice(None))
+ elif shape[dim_index] > 0:
+ # valid shape value
+ wid = shape[dim_index] - value.shape[dim_index]
+ if wid > 0:
+ pad_width_list.append((0, wid))
+ else:
+ pad_width_list.append((0, 0))
+ slice_list.append(slice(0, shape[dim_index]))
+ else:
+ # invalid
+ raise ValueError
+ pad_value = np.pad(value,
+ pad_width=pad_width_list,
+ mode='constant',
+ constant_values=padding_constant)
+ return pad_value[tuple(slice_list)]
+
+ @overload
+ def get_slice(self, stop: int):
+ """Slice [0, stop, 1] of all sliceable values."""
+ ...
+
+ @overload
+ def get_slice(self, start: int, stop: int):
+ """Slice [start, stop, 1] of all sliceable values."""
+ ...
+
+ @overload
+ def get_slice(self, start: int, stop: int, step: int):
+ """Slice [start, stop, step] of all sliceable values."""
+ ...
+
+ def get_slice(self,
+ arg_0: int,
+ arg_1: Union[int, Any] = None,
+ step: int = 1) -> _HumanData:
+ """Slice all sliceable values along major_dim dimension.
+
+ Args:
+ arg_0 (int):
+ When arg_1 is None, arg_0 is stop and start=0.
+ When arg_1 is not None, arg_0 is start.
+ arg_1 (Union[int, Any], optional):
+ None or where to stop.
+ Defaults to None.
+ step (int, optional):
+ Length of step. Defaults to 1.
+
+ Returns:
+ HumanData:
+ A new HumanData instance with sliced values.
+ """
+ ret_human_data = \
+ HumanData.new(key_strict=self.get_key_strict())
+ if arg_1 is None:
+ start = 0
+ stop = arg_0
+ else:
+ start = arg_0
+ stop = arg_1
+ slice_index = slice(start, stop, step)
+ dim_dict = self.__get_slice_dim__()
+ for key, dim in dim_dict.items():
+ # keys not expected be sliced
+ if dim is None:
+ ret_human_data[key] = self[key]
+ elif isinstance(dim, dict):
+ value_dict = self.get_raw_value(key)
+ sliced_dict = {}
+ for sub_key in value_dict.keys():
+ sub_value = value_dict[sub_key]
+ if dim[sub_key] is None:
+ sliced_dict[sub_key] = sub_value
+ else:
+ sub_dim = dim[sub_key]
+ sliced_sub_value = \
+ HumanData.__get_sliced_result__(
+ sub_value, sub_dim, slice_index)
+ sliced_dict[sub_key] = sliced_sub_value
+ ret_human_data[key] = sliced_dict
+ else:
+ value = self[key]
+ sliced_value = \
+ HumanData.__get_sliced_result__(
+ value, dim, slice_index)
+ ret_human_data[key] = sliced_value
+ # check keypoints compressed
+ if self.check_keypoints_compressed():
+ ret_human_data.compress_keypoints_by_mask()
+ return ret_human_data
+
+ def __get_slice_dim__(self) -> dict:
+ """For each key in this HumanData, get the dimension for slicing. 0 for
+ default, if no other value specified.
+
+ Returns:
+ dict:
+ Keys are self.keys().
+ Values indicate where to slice.
+ None for keys that are not expected to be sliced
+ or whose slice dimension could not be determined.
+ """
+ supported_keys = self.__class__.SUPPORTED_KEYS
+ ret_dict = {}
+ for key in self.keys():
+ # keys not expected be sliced
+ if key in supported_keys and \
+ 'dim' in supported_keys[key] and \
+ supported_keys[key]['dim'] is None:
+ ret_dict[key] = None
+ else:
+ value = self[key]
+ if isinstance(value, dict) and len(value) > 0:
+ ret_dict[key] = {}
+ for sub_key in value.keys():
+ try:
+ sub_value_len = len(value[sub_key])
+ if sub_value_len != self.__data_len__:
+ ret_dict[key][sub_key] = None
+ elif 'dim' in value:
+ ret_dict[key][sub_key] = value['dim']
+ else:
+ ret_dict[key][sub_key] = 0
+ except TypeError:
+ ret_dict[key][sub_key] = None
+ continue
+ # instance cannot be sliced without len method
+ try:
+ value_len = len(value)
+ except TypeError:
+ ret_dict[key] = None
+ continue
+ # slice on dim 0 by default
+ slice_dim = 0
+ if key in supported_keys and \
+ 'dim' in supported_keys[key]:
+ slice_dim = \
+ supported_keys[key]['dim']
+ data_len = value_len if slice_dim == 0 \
+ else value.shape[slice_dim]
+ # dim not for slice
+ if data_len != self.__data_len__:
+ ret_dict[key] = None
+ continue
+ else:
+ ret_dict[key] = slice_dim
+ return ret_dict
+
+ def __setitem__(self, key: _KT, val: _VT) -> None:
+ """Set self[key] to value. Only be called when using
+ human_data[key] = val. Methods like update won't call __setitem__.
+ In keypoints_compressed mode, if the key contains 'keypoints',
+ and f'{key}_mask' is in self.keys(), invalid zeros
+ will be removed before setting value.
+
+ Args:
+ key (_KT):
+ Key in HumanData.
+ Better be an element in HumanData.SUPPORTED_KEYS.
+ If not, an Error will be raised in key_strict mode.
+ val (_VT):
+ Value to the key.
+
+ Raises:
+ KeyError:
+ self.get_key_strict() is True and
+ key cannot be found in
+ HumanData.SUPPORTED_KEYS.
+ ValueError:
+ Value is supported but doesn't match definition.
+ ValueError:
+ self.check_keypoints_compressed() is True and
+ mask of a keypoint item is missing.
+ """
+ self.__check_key__(key)
+ self.__check_value__(key, val)
+ # if it can be compressed by mask
+ if self.__keypoints_compressed__:
+ class_logger = self.__class__.logger
+ if 'keypoints' in key and \
+ '_mask' in key:
+ msg = 'Mask cannot be modified ' +\
+ 'in keypoints_compressed mode.'
+ print_log(msg=msg, logger=class_logger, level=logging.WARN)
+ return
+ elif isinstance(val, np.ndarray) and \
+ 'keypoints' in key and \
+ '_mask' not in key:
+ mask_key = f'{key}_mask'
+ if mask_key in self:
+ mask_array = np.asarray(super().__getitem__(mask_key))
+ val = \
+ self.__class__.__remove_zero_pad__(val, mask_array)
+ else:
+ msg = f'Mask for {key} has not been set.' +\
+ f' Please set {mask_key} before compression.'
+ print_log(msg=msg,
+ logger=class_logger,
+ level=logging.ERROR)
+ raise ValueError
+ dict.__setitem__(self, key, val)
+
+ def set_raw_value(self, key: _KT, val: _VT) -> None:
+ """Set the raw value of self[key] to val after key check. It acts the
+ same as dict.__setitem__(self, key, val) if the key satisfied
+ constraints.
+
+ Args:
+ key (_KT):
+ Key in dict.
+ val (_VT):
+ Value to the key.
+
+ Raises:
+ KeyError:
+ self.get_key_strict() is True and
+ key cannot be found in
+ HumanData.SUPPORTED_KEYS.
+ ValueError:
+ Value is supported but doesn't match definition.
+ """
+ self.__check_key__(key)
+ self.__check_value__(key, val)
+ dict.__setitem__(self, key, val)
+
+ def pop_unsupported_items(self) -> None:
+ """Find every item with a key not in HumanData.SUPPORTED_KEYS, and pop
+ it to save memory."""
+ for key in list(self.keys()):
+ if key not in self.__class__.SUPPORTED_KEYS:
+ self.pop(key)
+
+ def __check_key__(self, key: Any) -> _KeyCheck:
+ """Check whether the key matches definition in
+ HumanData.SUPPORTED_KEYS.
+
+ Args:
+ key (Any):
+ Key in HumanData.
+
+ Returns:
+ _KeyCheck:
+ PASS, WARN or ERROR.
+
+ Raises:
+ KeyError:
+ self.get_key_strict() is True and
+ key cannot be found in
+ HumanData.SUPPORTED_KEYS.
+ """
+ ret_key_check = _KeyCheck.PASS
+ if self.get_key_strict():
+ if key not in self.__class__.SUPPORTED_KEYS:
+ ret_key_check = _KeyCheck.ERROR
+ else:
+ if key not in self.__class__.SUPPORTED_KEYS and \
+ key not in self.__class__.WARNED_KEYS:
+ # log warning message at the first time
+ ret_key_check = _KeyCheck.WARN
+ self.__class__.WARNED_KEYS.append(key)
+ if ret_key_check == _KeyCheck.ERROR:
+ raise KeyError(self.__class__.__get_key_error_msg__(key))
+ elif ret_key_check == _KeyCheck.WARN:
+ class_logger = self.__class__.logger
+ if class_logger == 'silent':
+ pass
+ else:
+ print_log(msg=self.__class__.__get_key_warn_msg__(key),
+ logger=class_logger,
+ level=logging.WARN)
+ return ret_key_check
+
+ def __check_value__(self, key: Any, val: Any) -> bool:
+ """Check whether the value matches definition in
+ HumanData.SUPPORTED_KEYS.
+
+ Args:
+ key (Any):
+ Key in HumanData.
+ val (Any):
+ Value to the key.
+
+ Returns:
+ bool:
+ True for matched, otherwise False.
+
+ Raises:
+ ValueError:
+ Value is supported but doesn't match definition.
+ """
+ ret_bool = self.__check_value_type__(key, val) and\
+ self.__check_value_shape__(key, val) and\
+ self.__check_value_len__(key, val)
+ if not ret_bool:
+ raise ValueError(self.__class__.__get_value_error_msg__())
+ return ret_bool
+
+ def __check_value_type__(self, key: Any, val: Any) -> bool:
+ """Check whether the type of val matches definition in
+ HumanData.SUPPORTED_KEYS.
+
+ Args:
+ key (Any):
+ Key in HumanData.
+ val (Any):
+ Value to the key.
+
+ Returns:
+ bool:
+ If type doesn't match, return False.
+ Else return True.
+ """
+ ret_bool = True
+ supported_keys = self.__class__.SUPPORTED_KEYS
+ # check definition
+ if key in supported_keys:
+ # check type
+ if type(val) != supported_keys[key]['type']:
+ ret_bool = False
+ if not ret_bool:
+ expected_type = supported_keys[key]['type']
+ err_msg = 'Type check Failed:\n'
+ err_msg += f'key={str(key)}\n'
+ err_msg += f'type(val)={type(val)}\n'
+ err_msg += f'expected type={expected_type}\n'
+ print_log(msg=err_msg,
+ logger=self.__class__.logger,
+ level=logging.ERROR)
+ return ret_bool
+
+ def __check_value_shape__(self, key: Any, val: Any) -> bool:
+ """Check whether the shape of val matches definition in
+ HumanData.SUPPORTED_KEYS.
+
+ Args:
+ key (Any):
+ Key in HumanData.
+ val (Any):
+ Value to the key.
+
+ Returns:
+ bool:
+ If expected shape is defined and doesn't match,
+ return False.
+ Else return True.
+ """
+ ret_bool = True
+ supported_keys = self.__class__.SUPPORTED_KEYS
+ # check definition
+ if key in supported_keys:
+ # check shape
+ if 'shape' in supported_keys[key]:
+ val_shape = val.shape
+ for shape_ind in range(len(supported_keys[key]['shape'])):
+ # length not match
+ if shape_ind >= len(val_shape):
+ ret_bool = False
+ break
+ expect_val = supported_keys[key]['shape'][shape_ind]
+ # value not match
+ if expect_val > 0 and \
+ expect_val != val_shape[shape_ind]:
+ ret_bool = False
+ break
+ if not ret_bool:
+ expected_shape = str(supported_keys[key]['shape'])
+ expected_shape = expected_shape.replace('-1', 'Any')
+ err_msg = 'Shape check Failed:\n'
+ err_msg += f'key={str(key)}\n'
+ err_msg += f'val.shape={val_shape}\n'
+ err_msg += f'expected shape={expected_shape}\n'
+ print_log(msg=err_msg,
+ logger=self.__class__.logger,
+ level=logging.ERROR)
+ return ret_bool
+
+ @property
+ def data_len(self) -> int:
+ """Get the temporal length of this HumanData instance.
+
+ Returns:
+ int:
+ Number of frames related to this instance.
+ """
+ return self.__data_len__
+
+ @data_len.setter
+ def data_len(self, value: int):
+ """Set the temporal length of this HumanData instance.
+
+ Args:
+ value (int):
+ Number of frames related to this instance.
+ """
+ self.__data_len__ = value
+
+ def __check_value_len__(self, key: Any, val: Any) -> bool:
+ """Check whether the temporal length of val matches other values.
+
+ Args:
+ key (Any):
+ Key in HumanData.
+ val (Any):
+ Value to the key.
+
+ Returns:
+ bool:
+ If temporal dim is defined and temporal length doesn't match,
+ return False.
+ Else return True.
+ """
+ ret_bool = True
+ supported_keys = self.__class__.SUPPORTED_KEYS
+ # check definition
+ if key in supported_keys:
+ # check temporal length
+ if 'dim' in supported_keys[key] and \
+ supported_keys[key]['dim'] is not None:
+ val_slice_dim = supported_keys[key]['dim']
+ if supported_keys[key]['type'] == dict:
+ slice_key = supported_keys[key]['slice_key']
+ val_data_len = val[slice_key].shape[val_slice_dim]
+ else:
+ val_data_len = val.shape[val_slice_dim]
+ if self.data_len < 0:
+ # no data_len yet, assign a new one
+ self.data_len = val_data_len
+ else:
+ # check if val_data_len matches recorded data_len
+ if self.data_len != val_data_len:
+ ret_bool = False
+ if not ret_bool:
+ err_msg = 'Temporal check Failed:\n'
+ err_msg += f'key={str(key)}\n'
+ err_msg += f'val\'s data_len={val_data_len}\n'
+ err_msg += f'expected data_len={self.data_len}\n'
+ print_log(msg=err_msg,
+ logger=self.__class__.logger,
+ level=logging.ERROR)
+ return ret_bool
+
+ def generate_mask_from_confidence(self, keys=None) -> None:
+ """Generate mask from keypoints' confidence. Keypoints that have zero
+ confidence in all occurrences will have a zero mask. Note that the last
+ value of the keypoint is assumed to be confidence.
+
+ Args:
+ keys: None, str, or list of str.
+ None: all keys with `keypoint` in it will have mask
+ generated from their confidence.
+ str: key of the keypoint, the mask has name f'{key}_mask'.
+ list of str: a list of keys of the keypoints.
+ Generate mask for multiple keypoints.
+ Defaults to None.
+
+ Returns:
+ None
+
+ Raises:
+ KeyError:
+ A key is not found.
+ """
+ if keys is None:
+ keys = []
+ for key in self.keys():
+ val = self.get_raw_value(key)
+ if isinstance(val, np.ndarray) and \
+ 'keypoints' in key and \
+ '_mask' not in key:
+ keys.append(key)
+ elif isinstance(keys, str):
+ keys = [keys]
+ elif isinstance(keys, list):
+ for key in keys:
+ assert isinstance(key, str)
+ else:
+ raise TypeError(f'`Keys` must be None, str, or list of str, '
+ f'got {type(keys)}.')
+
+ update_dict = {}
+ for kpt_key in keys:
+ kpt_array = self.get_raw_value(kpt_key)
+ num_joints = kpt_array.shape[-2]
+ # if all conf of a joint are zero, this joint is masked
+ joint_conf = kpt_array[..., -1].reshape(-1, num_joints)
+ mask_array = (joint_conf > 0).astype(np.uint8).max(axis=0)
+ assert len(mask_array) == num_joints
+ # generate mask
+ update_dict[f'{kpt_key}_mask'] = mask_array
+ self.update(update_dict)
+
+ def compress_keypoints_by_mask(self) -> None:
+ """If a key contains 'keypoints', and f'{key}_mask' is in self.keys(),
+ invalid zeros will be removed and f'{key}_mask' will be locked.
+
+ Raises:
+ KeyError:
+ A key containing 'keypoints' has been found
+ but its corresponding mask is missing.
+ """
+ assert self.__keypoints_compressed__ is False
+ key_pairs = []
+ for key in self.keys():
+ mask_key = f'{key}_mask'
+ val = self.get_raw_value(key)
+ if isinstance(val, np.ndarray) and \
+ 'keypoints' in key and \
+ '_mask' not in key and 'has' not in key:
+ if mask_key in self:
+ key_pairs.append([key, mask_key])
+ else:
+ msg = f'Mask for {key} has not been set.' +\
+ f' Please set {mask_key} before compression.'
+ raise KeyError(msg)
+ compressed_dict = {}
+ for kpt_key, mask_key in key_pairs:
+ kpt_array = self.get_raw_value(kpt_key)
+ mask_array = np.asarray(self.get_raw_value(mask_key))
+ compressed_kpt = \
+ self.__class__.__remove_zero_pad__(kpt_array, mask_array)
+ compressed_dict[kpt_key] = compressed_kpt
+ # set value after all pairs are compressed
+ self.update(compressed_dict)
+ self.__keypoints_compressed__ = True
+
+ def decompress_keypoints(self) -> None:
+ """If a key contains 'keypoints', and f'{key}_mask' is in self.keys(),
+ invalid zeros will be inserted to the right places and f'{key}_mask'
+ will be unlocked.
+
+ Raises:
+ KeyError:
+ A key containing 'keypoints' has been found
+ but its corresponding mask is missing.
+ """
+ assert self.__keypoints_compressed__ is True
+ key_pairs = []
+ for key in self.keys():
+ mask_key = f'{key}_mask'
+ val = self.get_raw_value(key)
+ if isinstance(val, np.ndarray) and \
+ 'keypoints' in key and \
+ '_mask' not in key:
+ if mask_key in self:
+ key_pairs.append([key, mask_key])
+ else:
+ class_logger = self.__class__.logger
+ msg = f'Mask for {key} has not been found.' +\
+ f' Please remove {key} before decompression.'
+ print_log(msg=msg,
+ logger=class_logger,
+ level=logging.ERROR)
+ raise KeyError
+ decompressed_dict = {}
+ for kpt_key, mask_key in key_pairs:
+ mask_array = np.asarray(self.get_raw_value(mask_key))
+ compressed_kpt = self.get_raw_value(kpt_key)
+ kpt_array = \
+ self.__class__.__add_zero_pad__(compressed_kpt, mask_array)
+ decompressed_dict[kpt_key] = kpt_array
+ # set value after all pairs are decompressed
+ self.update(decompressed_dict)
+ self.__keypoints_compressed__ = False
+
+ def dump_by_pickle(self, pkl_path: str, overwrite: bool = True) -> None:
+ """Dump keys and items to a pickle file. It's a secondary dump method,
+ when a HumanData instance is too large to be dumped by self.dump()
+
+ Args:
+ pkl_path (str):
+ Path to a dumped pickle file.
+ overwrite (bool, optional):
+ Whether to overwrite if there is already a file.
+ Defaults to True.
+
+ Raises:
+ ValueError:
+ pkl_path does not end with '.pkl'.
+ FileExistsError:
+ When overwrite is False and file exists.
+ """
+ if not check_path_suffix(pkl_path, ['.pkl']):
+ raise ValueError('Not a pkl file.')
+ if not overwrite:
+ if check_path_existence(pkl_path, 'file') == Existence.FileExist:
+ raise FileExistsError
+ dict_to_dump = {
+ '__key_strict__': self.__key_strict__,
+ '__data_len__': self.__data_len__,
+ '__keypoints_compressed__': self.__keypoints_compressed__,
+ }
+ dict_to_dump.update(self)
+ with open(pkl_path, 'wb') as f_writeb:
+ pickle.dump(dict_to_dump,
+ f_writeb,
+ protocol=pickle.HIGHEST_PROTOCOL)
+
+ def load_by_pickle(self, pkl_path: str) -> None:
+ """Load data from pkl_path and update them to self.
+
+ When a HumanData instance was dumped by
+ self.dump_by_pickle(), use this method to load it.
+
+ Args:
+ pkl_path (str):
+ Path to a dumped pickle file.
+ """
+ with open(pkl_path, 'rb') as f_readb:
+ tmp_data_dict = pickle.load(f_readb)
+ for key, value in list(tmp_data_dict.items()):
+ if value is None:
+ tmp_data_dict.pop(key)
+ elif key == '__key_strict__' or \
+ key == '__data_len__' or\
+ key == '__keypoints_compressed__':
+ self.__setattr__(key, value)
+ # pop the attributes to keep dict clean
+ tmp_data_dict.pop(key)
+ elif key == 'bbox_xywh' and value.shape[1] == 4:
+ value = np.hstack([value, np.ones([value.shape[0], 1])])
+ tmp_data_dict[key] = value
+ else:
+ tmp_data_dict[key] = value
+ self.update(tmp_data_dict)
+ self.__set_default_values__()
+
+ def __set_default_values__(self) -> None:
+ """For older versions of HumanData, call this method to apply missing
+ values (also attributes)."""
+ supported_keys = self.__class__.SUPPORTED_KEYS
+ if self.__data_len__ == -1:
+ for key in supported_keys:
+ if key in self and \
+ 'dim' in supported_keys[key] and\
+ supported_keys[key]['dim'] is not None:
+ if 'slice_key' in supported_keys[key] and\
+ supported_keys[key]['type'] == dict:
+ sub_key = supported_keys[key]['slice_key']
+ slice_dim = supported_keys[key]['dim']
+ self.__data_len__ = \
+ self[key][sub_key].shape[slice_dim]
+ else:
+ slice_dim = supported_keys[key]['dim']
+ self.__data_len__ = self[key].shape[slice_dim]
+ break
+ for key in list(self.keys()):
+ convention_key = f'{key}_convention'
+ if key.startswith('keypoints') and \
+ not key.endswith('_mask') and \
+ not key.endswith('_convention') and \
+ convention_key not in self:
+ self[convention_key] = 'human_data'
+
+ @classmethod
+ def concatenate(cls, human_data_0: _HumanData,
+ human_data_1: _HumanData) -> _HumanData:
+ """Concatenate two human_data. All keys will be kept it the returned
+ human_data. If either value from human_data_0 or human_data_1 matches
+ data_len from its HumanData, the two values will be concatenated as a
+ single value. If not, postfix will be added to the key to specify
+ source of the value.
+
+ Args:
+ human_data_0 (_HumanData)
+ human_data_1 (_HumanData)
+
+ Returns:
+ _HumanData:
+ A new human_data instance with all concatenated data.
+ """
+ ret_human_data = cls.new(key_strict=False)
+ set_0 = set(human_data_0.keys())
+ set_1 = set(human_data_1.keys())
+ common_keys = set_0.intersection(set_1)
+ dim_dict_0 = human_data_0.__get_slice_dim__()
+ dim_dict_1 = human_data_1.__get_slice_dim__()
+ for key in common_keys:
+ value_0 = human_data_0[key]
+ value_1 = human_data_1[key]
+ # align type
+ value_0 = list(value_0) if isinstance(value_0, tuple)\
+ else value_0
+ value_1 = list(value_1) if isinstance(value_1, tuple)\
+ else value_1
+ assert type(value_0) == type(value_1)
+ # align convention
+ if key.startswith('keypoints') and\
+ key.endswith('_convention'):
+ assert value_0 == value_1
+ ret_human_data[key] = value_0
+ continue
+ # mask_0 and mask_1
+ elif key.startswith('keypoints') and\
+ key.endswith('_mask'):
+ new_mask = value_0 * value_1
+ ret_human_data[key] = new_mask
+ continue
+ # go through the sub dict
+ if isinstance(value_0, dict):
+ sub_dict = {}
+ for sub_key, sub_value_0 in value_0.items():
+ # only found in value_0
+ if sub_key not in value_1:
+ sub_dict[sub_key] = sub_value_0
+ # found in both values
+ else:
+ sub_value_1 = value_1[sub_key]
+ concat_sub_dict = cls.__concat_value__(
+ key=sub_key,
+ value_0=sub_value_0,
+ dim_0=dim_dict_0[key][sub_key],
+ value_1=sub_value_1,
+ dim_1=dim_dict_1[key][sub_key])
+ sub_dict.update(concat_sub_dict)
+ for sub_key, sub_value_1 in value_1.items():
+ if sub_key not in value_0:
+ sub_dict[sub_key] = sub_value_1
+
+ ret_human_data[key] = sub_dict
+ # try concat
+ else:
+ concat_dict = cls.__concat_value__(key=key,
+ value_0=value_0,
+ dim_0=dim_dict_0[key],
+ value_1=value_1,
+ dim_1=dim_dict_1[key])
+ ret_human_data.update(concat_dict)
+ # check exclusive keys
+ for key, value in human_data_0.items():
+ if key not in common_keys:
+ # value not for concat and slice
+ if dim_dict_0[key] is None:
+ ret_human_data[key] = value
+ # value aligned with data_len of HumanData_0
+ else:
+ ret_human_data[f'{key}_0'] = value
+ for key, value in human_data_1.items():
+ if key not in common_keys:
+ # same as above
+ if dim_dict_1[key] is None:
+ ret_human_data[key] = value
+ else:
+ ret_human_data[f'{key}_1'] = value
+ return ret_human_data
+
+ @classmethod
+ def __concat_value__(cls, key: Any, value_0: Any, value_1: Any,
+ dim_0: Union[None, int], dim_1: Union[None,
+ int]) -> dict:
+ """Concat two values from two different HumanData.
+
+ Args:
+ key (Any):
+ The common key of the two values.
+ value_0 (Any):
+ Value from 0.
+ value_1 (Any):
+ Value from 1.
+ dim_0 (Union[None, int]):
+ The dim for concat and slice. None for N/A.
+ dim_1 (Union[None, int]):
+ The dim for concat and slice. None for N/A.
+
+ Returns:
+ dict:
+ Dict for concatenated result.
+ """
+ ret_dict = {}
+ if dim_0 is None or dim_1 is None:
+ ret_dict[f'{key}_0'] = value_0
+ ret_dict[f'{key}_1'] = value_1
+ elif isinstance(value_0, list):
+ ret_dict[key] = value_0 + value_1
+ # elif isinstance(value_0, np.ndarray):
+ else:
+ ret_dict[key] = np.concatenate((value_0, value_1), axis=dim_0)
+ return ret_dict
+
+ @classmethod
+ def __add_zero_pad__(cls, compressed_array: np.ndarray,
+ mask_array: np.ndarray) -> np.ndarray:
+ """Pad zeros to a compressed keypoints array.
+
+ Args:
+ compressed_array (np.ndarray):
+ A compressed keypoints array.
+ mask_array (np.ndarray):
+ The mask records compression relationship.
+
+ Returns:
+ np.ndarray:
+ A keypoints array in full-size.
+ """
+ assert mask_array.sum() == compressed_array.shape[1]
+ data_len, _, dim = compressed_array.shape
+ mask_len = mask_array.shape[0]
+ ret_value = np.zeros(shape=[data_len, mask_len, dim],
+ dtype=compressed_array.dtype)
+ valid_mask_index = np.where(mask_array == 1)[0]
+ ret_value[:, valid_mask_index, :] = compressed_array
+ return ret_value
+
+ @classmethod
+ def __remove_zero_pad__(cls, zero_pad_array: np.ndarray,
+ mask_array: np.ndarray) -> np.ndarray:
+ """Remove zero-padding from a full-size keypoints array.
+
+ Args:
+ zero_pad_array (np.ndarray):
+ A keypoints array in full-size.
+ mask_array (np.ndarray):
+ The mask records compression relationship.
+
+ Returns:
+ np.ndarray:
+ A compressed keypoints array.
+ """
+ assert mask_array.shape[0] == zero_pad_array.shape[1]
+ valid_mask_index = np.where(mask_array == 1)[0]
+ ret_value = np.take(zero_pad_array, valid_mask_index, axis=1)
+ return ret_value
+
+ @classmethod
+ def __get_key_warn_msg__(cls, key: Any) -> str:
+ """Get the warning message when a key fails the check.
+
+ Args:
+ key (Any):
+ The key that failed the check.
+
+ Returns:
+ str:
+ The warning message.
+ """
+ class_name = cls.__name__
+ warn_message = \
+ f'{key} is absent in' +\
+ f' {class_name}.SUPPORTED_KEYS.\n'
+ suggestion_message = \
+ 'Ignore this if you know exactly' +\
+ ' what you are doing.\n' +\
+ 'Otherwise, Call self.set_key_strict(True)' +\
+ ' to avoid wrong keys.\n'
+ return warn_message + suggestion_message
+
+ @classmethod
+ def __get_key_error_msg__(cls, key: Any) -> str:
+ """Get the error message when a key fails the check.
+
+ Args:
+ key (Any):
+ The key that failed the check.
+
+ Returns:
+ str:
+ The error message.
+ """
+ class_name = cls.__name__
+ absent_message = \
+ f'{key} is absent in' +\
+ f' {class_name}.SUPPORTED_KEYS.\n'
+ suggestion_message = \
+ 'Call self.set_key_strict(False)' +\
+ ' to allow unsupported keys.\n'
+ return absent_message + suggestion_message
+
+ @classmethod
+ def __get_value_error_msg__(cls) -> str:
+ """Get the error message when a value fails the check.
+
+ Returns:
+ str:
+ The error message.
+ """
+ error_message = \
+ 'A supported value doesn\'t ' +\
+ 'match definition.\n'
+ suggestion_message = \
+ 'See error log for details.\n'
+ return error_message + suggestion_message
+
+ @classmethod
+ def __get_sliced_result__(
+ cls, input_data: Union[np.ndarray, list, tuple], slice_dim: int,
+ slice_range: slice) -> Union[np.ndarray, list, tuple]:
+ """Slice input_data along slice_dim with slice_range.
+
+ Args:
+ input_data (Union[np.ndarray, list, tuple]):
+ Data to be sliced.
+ slice_dim (int):
+ Dimension to be sliced.
+ slice_range (slice):
+ An instance of class slice.
+
+ Returns:
+ Union[np.ndarray, list, tuple]:
+ A slice of input_data.
+ """
+ if isinstance(input_data, np.ndarray):
+ slice_list = [
+ slice(None),
+ ] * len(input_data.shape)
+ slice_list[slice_dim] = slice_range
+ sliced_data = input_data[tuple(slice_list)]
+ else:
+ sliced_data = \
+ input_data[slice_range]
+ return sliced_data
diff --git a/detrsmpl/data/data_structures/human_data_cache.py b/detrsmpl/data/data_structures/human_data_cache.py
new file mode 100644
index 0000000000000000000000000000000000000000..122b3ed1d5663e5ffa903b9b5d1e9d56ed102cec
--- /dev/null
+++ b/detrsmpl/data/data_structures/human_data_cache.py
@@ -0,0 +1,104 @@
+from typing import List
+
+import numpy as np
+
+from detrsmpl.utils.path_utils import (
+ Existence,
+ check_path_existence,
+ check_path_suffix,
+)
+from .human_data import HumanData
+
+
+class HumanDataCacheReader():
+ def __init__(self, npz_path: str):
+ self.npz_path = npz_path
+ npz_file = np.load(npz_path, allow_pickle=True)
+ self.slice_size = npz_file['slice_size'].item()
+ self.data_len = npz_file['data_len'].item()
+ self.keypoints_info = npz_file['keypoints_info'].item()
+ self.non_sliced_data = None
+ self.npz_file = None
+
+ def __del__(self):
+ if self.npz_file is not None:
+ self.npz_file.close()
+
+ def get_item(self, index, required_keys: List[str] = []):
+ if self.npz_file is None:
+ self.npz_file = np.load(self.npz_path, allow_pickle=True)
+ cache_key = str(int(index / self.slice_size))
+ base_data = self.npz_file[cache_key].item()
+ base_data.update(self.keypoints_info)
+ for key in required_keys:
+ non_sliced_value = self.get_non_sliced_data(key)
+ if isinstance(non_sliced_value, dict) and\
+ key in base_data and\
+ isinstance(base_data[key], dict):
+ base_data[key].update(non_sliced_value)
+ else:
+ base_data[key] = non_sliced_value
+ ret_human_data = HumanData.new(source_dict=base_data)
+ # data in cache is compressed
+ ret_human_data.__keypoints_compressed__ = True
+ # set missing values and attributes by default method
+ ret_human_data.__set_default_values__()
+ return ret_human_data
+
+ def get_non_sliced_data(self, key: str):
+ if self.non_sliced_data is None:
+ if self.npz_file is None:
+ npz_file = np.load(self.npz_path, allow_pickle=True)
+ self.non_sliced_data = npz_file['non_sliced_data'].item()
+ else:
+ self.non_sliced_data = self.npz_file['non_sliced_data'].item()
+ return self.non_sliced_data[key]
+
+
+class HumanDataCacheWriter():
+ def __init__(self,
+ slice_size: int,
+ data_len: int,
+ keypoints_info: dict,
+ non_sliced_data: dict,
+ key_strict: bool = True):
+ self.slice_size = slice_size
+ self.data_len = data_len
+ self.keypoints_info = keypoints_info
+ self.non_sliced_data = non_sliced_data
+ self.sliced_data = {}
+ self.key_strict = key_strict
+
+ def update_sliced_dict(self, sliced_dict):
+ self.sliced_data.update(sliced_dict)
+
+ def dump(self, npz_path: str, overwrite: bool = True):
+ """Dump keys and items to an npz file.
+
+ Args:
+ npz_path (str):
+ Path to a dumped npz file.
+ overwrite (bool, optional):
+ Whether to overwrite if there is already a file.
+ Defaults to True.
+
+ Raises:
+ ValueError:
+ npz_path does not end with '.npz'.
+ FileExistsError:
+ When overwrite is False and file exists.
+ """
+ if not check_path_suffix(npz_path, ['.npz']):
+ raise ValueError('Not an npz file.')
+ if not overwrite:
+ if check_path_existence(npz_path, 'file') == Existence.FileExist:
+ raise FileExistsError
+ dict_to_dump = {
+ 'slice_size': self.slice_size,
+ 'data_len': self.data_len,
+ 'keypoints_info': self.keypoints_info,
+ 'non_sliced_data': self.non_sliced_data,
+ 'key_strict': self.key_strict,
+ }
+ dict_to_dump.update(self.sliced_data)
+ np.savez_compressed(npz_path, **dict_to_dump)
diff --git a/detrsmpl/data/data_structures/multi_human_data.py b/detrsmpl/data/data_structures/multi_human_data.py
new file mode 100644
index 0000000000000000000000000000000000000000..68ed9d9336572db13ee52f51bd45a1b4cd4be5da
--- /dev/null
+++ b/detrsmpl/data/data_structures/multi_human_data.py
@@ -0,0 +1,480 @@
+import logging
+import pickle
+from enum import Enum
+from typing import Any, TypeVar, Union
+
+import numpy as np
+from mmcv.utils import print_log
+
+from detrsmpl.data.data_structures.human_data import HumanData
+from detrsmpl.utils.path_utils import (
+ Existence,
+ check_path_existence,
+ check_path_suffix,
+)
+
+# In T = TypeVar('T'), T can be anything.
+# See definition of typing.TypeVar for details.
+_HumanData = TypeVar('_HumanData')
+
+_MultiHumanData_SUPPORTED_KEYS = HumanData.SUPPORTED_KEYS.copy()
+_MultiHumanData_SUPPORTED_KEYS.update(
+ {'optional': {
+ 'type': dict,
+ 'slice_key': 'frame_range',
+ 'dim': 0
+ }})
+
+
+class _KeyCheck(Enum):
+ PASS = 0
+ WARN = 1
+ ERROR = 2
+
+
+class MultiHumanData(HumanData):
+ SUPPORTED_KEYS = _MultiHumanData_SUPPORTED_KEYS
+
+ def __new__(cls: _HumanData, *args: Any, **kwargs: Any) -> _HumanData:
+ """New an instance of HumanData.
+
+ Args:
+ cls (HumanData): HumanData class.
+
+ Returns:
+ HumanData: An instance of Hu
+ """
+ ret_human_data = super().__new__(cls, args, kwargs)
+ setattr(ret_human_data, '__data_len__', -1)
+ setattr(ret_human_data, '__instance_num__', -1)
+ setattr(ret_human_data, '__key_strict__', False)
+ setattr(ret_human_data, '__keypoints_compressed__', False)
+ return ret_human_data
+
+ def load(self, npz_path: str):
+ """Load data from npz_path and update them to self.
+
+ Args:
+ npz_path (str):
+ Path to a dumped npz file.
+ """
+ supported_keys = self.__class__.SUPPORTED_KEYS
+ with np.load(npz_path, allow_pickle=True) as npz_file:
+ tmp_data_dict = dict(npz_file)
+ for key, value in list(tmp_data_dict.items()):
+ if isinstance(value, np.ndarray) and\
+ len(value.shape) == 0:
+ # value is not an ndarray before dump
+ value = value.item()
+ elif key in supported_keys and\
+ type(value) != supported_keys[key]['type']:
+ value = supported_keys[key]['type'](value)
+ if value is None:
+ tmp_data_dict.pop(key)
+ elif key == '__key_strict__' or \
+ key == '__data_len__' or\
+ key == '__instance_num__' or\
+ key == '__keypoints_compressed__':
+ self.__setattr__(key, value)
+ # pop the attributes to keep dict clean
+ tmp_data_dict.pop(key)
+ elif key == 'bbox_xywh' and value.shape[1] == 4:
+ value = np.hstack([value, np.ones([value.shape[0], 1])])
+ tmp_data_dict[key] = value
+ else:
+ tmp_data_dict[key] = value
+ self.update(tmp_data_dict)
+ self.__set_default_values__()
+
+ def dump(self, npz_path: str, overwrite: bool = True):
+ """Dump keys and items to an npz file.
+
+ Args:
+ npz_path (str):
+ Path to a dumped npz file.
+ overwrite (bool, optional):
+ Whether to overwrite if there is already a file.
+ Defaults to True.
+
+ Raises:
+ ValueError:
+ npz_path does not end with '.npz'.
+ FileExistsError:
+ When overwrite is False and file exists.
+ """
+ if not check_path_suffix(npz_path, ['.npz']):
+ raise ValueError('Not an npz file.')
+ if not overwrite:
+ if check_path_existence(npz_path, 'file') == Existence.FileExist:
+ raise FileExistsError
+ dict_to_dump = {
+ '__key_strict__': self.__key_strict__,
+ '__data_len__': self.__data_len__,
+ '__instance_num__': self.__instance_num__,
+ '__keypoints_compressed__': self.__keypoints_compressed__,
+ }
+ dict_to_dump.update(self)
+ np.savez_compressed(npz_path, **dict_to_dump)
+
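+    # Illustrative round trip (the file name and human_data_dict are
+    # placeholders): dump() stores the dict contents together with the
+    # bookkeeping attributes above, and load() restores them on a fresh
+    # instance.
+    #
+    #   data = MultiHumanData.new(source_dict=human_data_dict,
+    #                             key_strict=False)
+    #   data.dump('sample.npz')
+    #   restored = MultiHumanData()
+    #   restored.load('sample.npz')
+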
+ def dump_by_pickle(self, pkl_path: str, overwrite: bool = True) -> None:
+ """Dump keys and items to a pickle file. It's a secondary dump method,
+ when a HumanData instance is too large to be dumped by self.dump()
+
+ Args:
+ pkl_path (str):
+ Path to a dumped pickle file.
+ overwrite (bool, optional):
+ Whether to overwrite if there is already a file.
+ Defaults to True.
+
+ Raises:
+ ValueError:
+                pkl_path does not end with '.pkl'.
+ FileExistsError:
+ When overwrite is False and file exists.
+ """
+ if not check_path_suffix(pkl_path, ['.pkl']):
+            raise ValueError('Not a pkl file.')
+ if not overwrite:
+ if check_path_existence(pkl_path, 'file') == Existence.FileExist:
+ raise FileExistsError
+ dict_to_dump = {
+ '__key_strict__': self.__key_strict__,
+ '__data_len__': self.__data_len__,
+ '__instance_num__': self.__instance_num__,
+ '__keypoints_compressed__': self.__keypoints_compressed__,
+ }
+ dict_to_dump.update(self)
+ with open(pkl_path, 'wb') as f_writeb:
+ pickle.dump(dict_to_dump,
+ f_writeb,
+ protocol=pickle.HIGHEST_PROTOCOL)
+
+ def load_by_pickle(self, pkl_path: str) -> None:
+ """Load data from pkl_path and update them to self.
+
+        When a HumanData instance was dumped by
+        self.dump_by_pickle(), use this method to load it.
+
+        Args:
+            pkl_path (str):
+                Path to a dumped pickle file.
+ """
+ with open(pkl_path, 'rb') as f_readb:
+ tmp_data_dict = pickle.load(f_readb)
+ for key, value in list(tmp_data_dict.items()):
+ if value is None:
+ tmp_data_dict.pop(key)
+ elif key == '__key_strict__' or \
+ key == '__data_len__' or\
+ key == '__instance_num__' or\
+ key == '__keypoints_compressed__':
+ self.__setattr__(key, value)
+ # pop the attributes to keep dict clean
+ tmp_data_dict.pop(key)
+ elif key == 'bbox_xywh' and value.shape[1] == 4:
+ value = np.hstack([value, np.ones([value.shape[0], 1])])
+ tmp_data_dict[key] = value
+ else:
+ tmp_data_dict[key] = value
+ self.update(tmp_data_dict)
+ self.__set_default_values__()
+
+ @property
+ def instance_num(self) -> int:
+ """Get the human instance num of this MultiHumanData instance. In
+ MuliHumanData, an image may have multiple corresponding human
+ instances.
+
+ Returns:
+ int:
+ Number of human instance related to this instance.
+ """
+ return self.__instance_num__
+
+ @instance_num.setter
+ def instance_num(self, value: int):
+ """Set the human instance num of this MultiHumanData instance.
+
+ Args:
+ value (int):
+ Number of human instance related to this instance.
+ """
+ self.__instance_num__ = value
+
+ def get_slice(self,
+ arg_0: int,
+ arg_1: Union[int, Any] = None,
+ step: int = 1) -> _HumanData:
+ """Slice all sliceable values along major_dim dimension.
+
+ Args:
+ arg_0 (int):
+ When arg_1 is None, arg_0 is stop and start=0.
+ When arg_1 is not None, arg_0 is start.
+ arg_1 (Union[int, Any], optional):
+ None or where to stop.
+ Defaults to None.
+ step (int, optional):
+ Length of step. Defaults to 1.
+
+ Returns:
+ MultiHumanData:
+ A new MultiHumanData instance with sliced values.
+ """
+ ret_human_data = \
+ MultiHumanData.new(key_strict=self.get_key_strict())
+ if arg_1 is None:
+ start = 0
+ stop = arg_0
+ else:
+ start = arg_0
+ stop = arg_1
+ slice_index = slice(start, stop, step)
+ dim_dict = self.__get_slice_dim__()
+ # frame_range = self.get_raw_value('optional')['frame_range']
+ for key, dim in dim_dict.items():
+ # primary index
+ if key == 'optional':
+ frame_range = None
+ else:
+ frame_range = self.get_raw_value('optional')['frame_range']
+ # keys not expected be sliced
+ if dim is None:
+ ret_human_data[key] = self[key]
+ elif isinstance(dim, dict):
+ value_dict = self.get_raw_value(key)
+ sliced_dict = {}
+ for sub_key in value_dict.keys():
+ sub_value = value_dict[sub_key]
+ if dim[sub_key] is None:
+ sliced_dict[sub_key] = sub_value
+ else:
+ sub_dim = dim[sub_key]
+ sliced_sub_value = \
+ MultiHumanData.__get_sliced_result__(
+ sub_value, sub_dim, slice_index, frame_range)
+ sliced_dict[sub_key] = sliced_sub_value
+ ret_human_data[key] = sliced_dict
+ else:
+ value = self[key]
+ sliced_value = \
+ MultiHumanData.__get_sliced_result__(
+ value, dim, slice_index, frame_range)
+ ret_human_data[key] = sliced_value
+ # check keypoints compressed
+ if self.check_keypoints_compressed():
+ ret_human_data.compress_keypoints_by_mask()
+ return ret_human_data
+
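+    # Example of the slicing arguments (indices are illustrative):
+    # get_slice(100) keeps the first 100 primary entries, while
+    # get_slice(100, 200, step=2) keeps every second entry in [100, 200).
+    # Instance-aligned keys are gathered through 'optional'['frame_range'],
+    # so every human instance belonging to the selected frames is kept.
+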
+ def __get_slice_dim__(self) -> dict:
+ """For each key in this HumanData, get the dimension for slicing. 0 for
+ default, if no other value specified.
+
+ Returns:
+ dict:
+ Keys are self.keys().
+ Values indicate where to slice.
+ None for not expected to be sliced or
+ failed.
+ """
+ supported_keys = self.__class__.SUPPORTED_KEYS
+ ret_dict = {}
+ for key in self.keys():
+ # keys not expected be sliced
+ if key in supported_keys and \
+ 'dim' in supported_keys[key] and \
+ supported_keys[key]['dim'] is None:
+ ret_dict[key] = None
+ else:
+ value = self[key]
+ if isinstance(value, dict) and len(value) > 0:
+ ret_dict[key] = {}
+ for sub_key in value.keys():
+ try:
+ sub_value_len = len(value[sub_key])
+ if sub_value_len != self.instance_num and \
+ sub_value_len != self.data_len:
+ ret_dict[key][sub_key] = None
+ elif 'dim' in value:
+ ret_dict[key][sub_key] = value['dim']
+ else:
+ ret_dict[key][sub_key] = 0
+ except TypeError:
+ ret_dict[key][sub_key] = None
+ continue
+ # instance cannot be sliced without len method
+ try:
+ value_len = len(value)
+ except TypeError:
+ ret_dict[key] = None
+ continue
+ # slice on dim 0 by default
+ slice_dim = 0
+ if key in supported_keys and \
+ 'dim' in supported_keys[key]:
+ slice_dim = \
+ supported_keys[key]['dim']
+ data_len = value_len if slice_dim == 0 \
+ else value.shape[slice_dim]
+ # dim not for slice
+ if data_len != self.__instance_num__:
+ ret_dict[key] = None
+ continue
+ else:
+ ret_dict[key] = slice_dim
+ return ret_dict
+
+ # TODO: to support cache
+
+ def __check_value_len__(self, key: Any, val: Any) -> bool:
+ """Check whether the temporal length of val matches other values.
+
+ Args:
+ key (Any):
+ Key in MultiHumanData.
+ val (Any):
+ Value to the key.
+
+ Returns:
+ bool:
+ If temporal dim is defined and temporal length doesn't match,
+ return False.
+ Else return True.
+ """
+ ret_bool = True
+ supported_keys = self.__class__.SUPPORTED_KEYS
+
+ # MultiHumanData
+ instance_num = 0
+ if key == 'optional' and \
+ 'frame_range' in val:
+ for frame_range in val['frame_range']:
+ instance_num += (frame_range[-1] - frame_range[0])
+
+ if self.instance_num == -1:
+ # init instance_num for multi_human_data
+ self.instance_num = instance_num
+ elif self.instance_num != instance_num:
+ ret_bool = False
+
+ data_len = len(val['frame_range'])
+ if self.data_len == -1:
+ # init data_len
+ self.data_len = data_len
+ elif self.data_len == self.instance_num:
+ # update data_len
+ self.data_len = data_len
+ elif self.data_len != self.instance_num:
+ ret_bool = False
+
+ # check definition
+ elif key in supported_keys:
+ # check data length
+ if 'dim' in supported_keys[key] and \
+ supported_keys[key]['dim'] is not None:
+ val_slice_dim = supported_keys[key]['dim']
+ if supported_keys[key]['type'] == dict:
+ slice_key = supported_keys[key]['slice_key']
+ val_data_len = val[slice_key].shape[val_slice_dim]
+ else:
+ val_data_len = val.shape[val_slice_dim]
+
+ if self.instance_num < 0:
+ # Init instance_num for HumanData,
+ # which is equal to data_len.
+ self.instance_num = val_data_len
+ else:
+ # check if val_data_len matches recorded instance_num
+ if self.instance_num != val_data_len:
+ ret_bool = False
+
+ if self.data_len < 0:
+ # init data_len for HumanData, it's equal to
+ # instance_num.
+ # If it's MultiHumanData needs to be updated
+ self.data_len = val_data_len
+
+ if not ret_bool:
+ err_msg = 'Data length check Failed:\n'
+ err_msg += f'key={str(key)}\n'
+ if self.data_len != self.instance_num:
+ err_msg += f'val\'s instance_num={self.data_len}\n'
+ err_msg += f'expected instance_num={self.instance_num}\n'
+ print_log(msg=err_msg,
+ logger=self.__class__.logger,
+ level=logging.ERROR)
+ return ret_bool
+
+ def __set_default_values__(self) -> None:
+ """For older versions of HumanData, call this method to apply missing
+ values (also attributes).
+
+ Note:
+ 1. Older HumanData doesn't define `data_len`.
+            2. In newer HumanData, `data_len` equals `instance_num`.
+            3. In MultiHumanData, `instance_num` equals the number of
+                instances, and `data_len` equals the number of frames.
+ """
+ supported_keys = self.__class__.SUPPORTED_KEYS
+ if self.instance_num == -1:
+ # the loaded file is not multi_human_data
+ for key in supported_keys:
+ if key in self and \
+ 'dim' in supported_keys[key] and\
+ supported_keys[key]['dim'] is not None:
+ if 'slice_key' in supported_keys[key] and\
+ supported_keys[key]['type'] == dict:
+ sub_key = supported_keys[key]['slice_key']
+ slice_dim = supported_keys[key]['dim']
+ self.instance_num = self[key][sub_key].shape[slice_dim]
+ else:
+ slice_dim = supported_keys[key]['dim']
+ self.instance_num = self[key].shape[slice_dim]
+
+ # convert HumanData to MultiHumanData
+ self.data_len = self.instance_num
+ optional = {}
+ optional['frame_range'] = \
+ [[i, i + 1] for i in range(self.data_len)]
+ self['optional'] = optional
+ break
+
+ for key in list(self.keys()):
+ convention_key = f'{key}_convention'
+ if key.startswith('keypoints') and \
+ not key.endswith('_mask') and \
+ not key.endswith('_convention') and \
+ convention_key not in self:
+ self[convention_key] = 'human_data'
+
+ @classmethod
+ def __get_sliced_result__(
+ cls,
+ input_data: Union[np.ndarray, list, tuple],
+ slice_dim: int,
+ slice_range: slice,
+ frame_index: list = None) -> Union[np.ndarray, list, tuple]:
+
+ if frame_index is not None:
+ slice_data = []
+ for frame_range in frame_index[slice_range]:
+ slice_index = slice(frame_range[0], frame_range[-1], 1)
+ slice_result = \
+ HumanData.__get_sliced_result__(
+ input_data,
+ slice_dim,
+ slice_index)
+ for element in slice_result:
+ slice_data.append(element)
+ if isinstance(input_data, np.ndarray):
+ slice_data = np.array(slice_data)
+ else:
+ slice_data = type(input_data)(slice_data)
+ else:
+ # primary index
+ slice_data = \
+ HumanData.__get_sliced_result__(
+ input_data,
+ slice_dim,
+ slice_range)
+ return slice_data
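+
+
+# Worked example of the frame_range indirection above (values are
+# illustrative): with frame_index = [[0, 2], [2, 3]], slicing frames [0:1]
+# selects frame_range [0, 2] and gathers rows 0 and 1 of an instance-aligned
+# array, while slicing [0:2] additionally gathers row 2. Frame-level keys
+# such as 'optional' are sliced directly with the primary slice instead.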
diff --git a/detrsmpl/data/data_structures/smc_reader.py b/detrsmpl/data/data_structures/smc_reader.py
new file mode 100644
index 0000000000000000000000000000000000000000..7b3916114f86290db810bc0222a9647b4e1b8e53
--- /dev/null
+++ b/detrsmpl/data/data_structures/smc_reader.py
@@ -0,0 +1,1021 @@
+import json
+
+import cv2
+import h5py
+import numpy as np
+import torch
+import tqdm
+
+from detrsmpl.models.body_models.builder import build_body_model
+from detrsmpl.models.body_models.utils import batch_transform_to_camera_frame
+
+
+class SMCReader:
+ def __init__(self, file_path, body_model=None):
+ """Read SenseMocapFile endswith ".smc", see: https://github.com/open-
+ mmlab/detrsmpl/blob/main/docs/smc.md.
+
+ Args:
+ file_path (str):
+ Path to an SMC file.
+ body_model (nn.Module or dict):
+ Only needed for SMPL transformation to device frame
+ if nn.Module: a body_model instance
+ if dict: a body_model config
+ """
+ self.smc = h5py.File(file_path, 'r')
+ self.__calibration_dict__ = None
+ self.action_id = self.smc.attrs['action_id']
+ self.actor_id = self.smc.attrs['actor_id']
+ self.datetime_str = self.smc.attrs['datetime_str'] # .decode()
+ self.kinect_num_frames = self.smc['Kinect'].attrs['num_frame']
+ self.num_kinects = self.smc['Kinect'].attrs['num_device']
+ self.kinect_color_resolution = self.get_kinect_color_resolution(0)
+ self.kinect_depth_resolution = self.get_kinect_depth_resolution(0)
+ self.iphone_exists = 'iPhone' in self.smc.keys()
+ self.num_iphones = 1
+ if self.iphone_exists:
+ self.iphone_num_frames = self.smc['iPhone'].attrs['num_frame']
+ self.iphone_color_resolution = \
+ self.smc['iPhone'].attrs['color_resolution'] # vertical
+ self.iphone_depth_resolution = \
+ self.smc['iPhone'].attrs['depth_resolution'] # vertical
+ self.keypoint_exists = 'Keypoints3D' in self.smc.keys()
+ if self.keypoint_exists:
+ self.keypoints_num_frames = self.smc['Keypoints3D'].attrs[
+ 'num_frame']
+ self.keypoints_convention = self.smc['Keypoints3D'].attrs[
+ 'convention']
+ self.keypoints_created_time = self.smc['Keypoints3D'].attrs[
+ 'created_time']
+ self.smpl_exists = 'SMPL' in self.smc.keys()
+ if self.smpl_exists:
+ self.smpl_num_frames = self.smc['SMPL'].attrs['num_frame']
+ self.smpl_created_time = self.smc['SMPL'].attrs['created_time']
+
+ # initialize body model
+ if isinstance(body_model, torch.nn.Module):
+ self.body_model = body_model
+ elif isinstance(body_model, dict):
+ self.body_model = build_body_model(body_model)
+ else:
+ # in most cases, SMCReader is instantiated for image reading
+ # only. Hence, it is wasteful to initialize a body model until
+ # really needed in get_smpl()
+ self.body_model = None
+ self.default_body_model_config = dict(
+ type='SMPL',
+ gender='neutral',
+ num_betas=10,
+ keypoint_src='smpl_45',
+ keypoint_dst='smpl_45',
+ model_path='data/body_models/smpl',
+ batch_size=1,
+ )
+
+ def get_kinect_color_extrinsics(self, kinect_id, homogeneous=True):
+ """Get extrinsics(cam2world) of a kinect RGB camera by kinect id.
+
+ Args:
+ kinect_id (int):
+ ID of a kinect, starts from 0.
+ homogeneous (bool, optional):
+ If true, returns rotation and translation in
+ one 4x4 matrix. Defaults to True.
+
+ Returns:
+ homogeneous is True
+ ndarray: A 4x4 matrix of rotation and translation(cam2world).
+ homogeneous is False
+ dict: A dict of rotation and translation,
+ keys are R and T,
+ each value is an ndarray.
+ """
+ R = np.asarray(self.calibration_dict[str(kinect_id * 2)]['R']).reshape(
+ 3, 3)
+ T = np.asarray(self.calibration_dict[str(kinect_id *
+ 2)]['T']).reshape(3)
+ if homogeneous:
+ extrinsics = np.identity(4, dtype=float)
+ extrinsics[:3, :3] = R
+ extrinsics[:3, 3] = T
+ return extrinsics
+ else:
+ return {'R': R, 'T': T}
+
+ @property
+ def calibration_dict(self):
+ """Get the dict of calibration.
+
+ Returns:
+ dict:
+ A dict of calibrated extrinsics.
+ """
+ if self.__calibration_dict__ is not None:
+ return self.__calibration_dict__
+ else:
+ return json.loads(self.smc['Extrinsics'][()])
+
+ def get_kinect_depth_extrinsics(self, kinect_id, homogeneous=True):
+ """Get extrinsics(cam2world) of a kinect depth camera by kinect id.
+
+ Args:
+ kinect_id (int):
+ ID of a kinect, starts from 0.
+ homogeneous (bool, optional):
+ If true, returns rotation and translation in
+ one 4x4 matrix. Defaults to True.
+
+ Returns:
+ homogeneous is True
+ ndarray: A 4x4 matrix of rotation and translation(cam2world).
+ homogeneous is False
+ dict: A dict of rotation and translation,
+ keys are R and T,
+ each value is an ndarray.
+ """
+ R = np.asarray(self.calibration_dict[str(kinect_id * 2 +
+ 1)]['R']).reshape(3, 3)
+ T = np.asarray(self.calibration_dict[str(kinect_id * 2 +
+ 1)]['T']).reshape(3)
+ if homogeneous:
+ extrinsics = np.identity(4, dtype=float)
+ extrinsics[:3, :3] = R
+ extrinsics[:3, 3] = T
+ return extrinsics
+ else:
+ return {'R': R, 'T': T}
+
+ def get_kinect_color_intrinsics(self, kinect_id):
+ """Get intrinsics of a kinect RGB camera by kinect id.
+
+ Args:
+ kinect_id (int):
+ ID of a kinect, starts from 0.
+
+ Returns:
+ ndarray: A 3x3 matrix.
+ """
+ kinect_dict = self.smc['Kinect'][str(kinect_id)]
+ intrinsics = \
+ kinect_dict['Calibration']['Color']['Intrinsics'][()]
+ cx, cy, fx, fy = intrinsics[:4]
+ intrinsics = \
+ np.asarray([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
+ return intrinsics
+
+ def get_kinect_color_resolution(self, kinect_id):
+ """Get resolution of a kinect RGB camera by kinect id.
+
+ Args:
+ kinect_id (int):
+ ID of a kinect, starts from 0.
+
+ Returns:
+ ndarray:
+ An ndarray of (width, height), shape=[2, ].
+ """
+ kinect_dict = self.smc['Kinect'][str(kinect_id)]
+ resolution = \
+ kinect_dict['Calibration']['Color']['Resolution'][()]
+ return resolution
+
+ def get_kinect_depth_resolution(self, kinect_id):
+ """Get resolution of a kinect depth camera by kinect id.
+
+ Args:
+ kinect_id (int):
+ ID of a kinect, starts from 0.
+
+ Returns:
+ ndarray:
+ An ndarray of (width, height), shape=[2, ].
+ """
+ kinect_dict = self.smc['Kinect'][str(kinect_id)]
+ resolution = \
+ kinect_dict['Calibration']['Depth']['Resolution'][()]
+ return resolution
+
+ def get_kinect_depth_intrinsics(self, kinect_id):
+ """Get intrinsics of a kinect depth camera by kinect id.
+
+ Args:
+ kinect_id (int):
+ ID of a kinect, starts from 0.
+
+ Returns:
+ ndarray: A 3x3 matrix.
+ """
+ kinect_dict = self.smc['Kinect'][str(kinect_id)]
+ intrinsics = \
+ kinect_dict['Calibration']['Depth']['Intrinsics'][()]
+ cx, cy, fx, fy = intrinsics[:4]
+ intrinsics = \
+ np.asarray([[fx, 0, cx], [0, fy, cy], [0, 0, 1]])
+ return intrinsics
+
+ def get_iphone_intrinsics(self, iphone_id=0, frame_id=0, vertical=True):
+ """Get intrinsics of an iPhone RGB camera by iPhone id.
+
+ Args:
+ iphone_id (int, optional):
+ ID of an iPhone, starts from 0.
+ Defaults to 0.
+ frame_id (int, optional):
+ int: frame id of one selected frame
+ Defaults to 0.
+ vertical (bool, optional):
+ iPhone assumes landscape orientation
+ if True, convert data to vertical orientation
+ Defaults to True.
+
+ Returns:
+ ndarray: A 3x3 matrix.
+ """
+ camera_info = self.smc['iPhone'][str(iphone_id)]['CameraInfo'][str(
+ frame_id)]
+ camera_info = json.loads(camera_info[()])
+ intrinsics = np.asarray(camera_info['cameraIntrinsics']).transpose()
+
+ # Intrinsics have to be adjusted to achieve rotation
+ # 1. swapping fx, fy
+ # 2. cx -> image height - cy; cy -> cx
+ if vertical:
+ fx, fy = intrinsics[0, 0], intrinsics[1, 1]
+ cx, cy = intrinsics[0, 2], intrinsics[1, 2]
+ W, H = self.get_iphone_color_resolution(vertical=False)
+ intrinsics = np.eye(3)
+ intrinsics[0, 0], intrinsics[1, 1] = fy, fx
+ intrinsics[0, 2], intrinsics[1, 2] = H - cy, cx
+
+ return intrinsics
+
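+    # Worked example of the adjustment above (numbers are illustrative): for
+    # a landscape 1920x1440 frame with (fx, fy, cx, cy) = (1500, 1500, 960,
+    # 720), the vertical intrinsics become fx' = fy = 1500, fy' = fx = 1500,
+    # cx' = H - cy = 1440 - 720 = 720 and cy' = cx = 960.
+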
+ def get_iphone_extrinsics(self,
+ iphone_id=0,
+ homogeneous=True,
+ vertical=True):
+ """Get extrinsics(cam2world) of an iPhone RGB camera by iPhone id.
+
+ Args:
+ iphone_id (int, optional):
+ ID of an iPhone, starts from 0.
+ Defaults to 0.
+ homogeneous (bool, optional):
+ If true, returns rotation and translation in
+ one 4x4 matrix. Defaults to True.
+ vertical (bool, optional):
+ iPhone assumes landscape orientation
+ if True, convert data to vertical orientation
+ Defaults to True.
+
+ Returns:
+ homogeneous is True
+ ndarray: A 4x4 transformation matrix(cam2world).
+ homogeneous is False
+ dict: A dict of rotation and translation,
+ keys are R and T,
+ each value is an ndarray.
+ """
+ if iphone_id != 0:
+ raise KeyError('Currently only one iPhone.')
+ R = np.asarray(self.calibration_dict['iPhone']['R']).reshape(3, 3)
+ T = np.asarray(self.calibration_dict['iPhone']['T']).reshape(3)
+
+ # cam2world
+ extrinsics = np.identity(4, dtype=float)
+ extrinsics[:3, :3] = R
+ extrinsics[:3, 3] = T
+
+ # Extrinsics have to be adjusted to achieve rotation
+ # A rotation matrix is applied on the extrinsics
+ if vertical:
+ # 90-degree clockwise rotation around z-axis
+ R = np.eye(4)
+ R[:2, :2] = np.array([[0, -1], [1, 0]])
+ # Note the extrinsics is cam2world
+ # world2cam_adjusted = R @ world2cam
+ # => cam2world_adjusted = cam2world @ inv(R)
+ extrinsics = extrinsics @ np.linalg.inv(R)
+ R = extrinsics[:3, :3]
+ T = extrinsics[:3, 3]
+
+ if homogeneous:
+ return extrinsics
+ else:
+ return {'R': R, 'T': T}
+
+ def get_iphone_color_resolution(self, iphone_id=0, vertical=True):
+ """Get color image resolution of an iPhone RGB camera by iPhone id.
+
+ Args:
+ iphone_id (int, optional):
+ ID of an iPhone, starts from 0.
+ Defaults to 0.
+ vertical (bool, optional):
+ iPhone assumes landscape orientation
+ if True, convert data to vertical orientation
+ Defaults to True.
+
+ Returns:
+            ndarray:
+ An ndarray of (width, height), shape=[2, ].
+ """
+ if iphone_id != 0:
+ raise KeyError('Currently only one iPhone.')
+ if vertical:
+ W_horizontal, H_horizontal = self.iphone_color_resolution
+ W_vertical, H_vertical = H_horizontal, W_horizontal
+ return np.array([W_vertical, H_vertical])
+ else:
+ return self.iphone_color_resolution
+
+ def get_kinect_color(self, kinect_id, frame_id=None, disable_tqdm=True):
+ """Get several frames captured by a kinect RGB camera.
+
+ Args:
+ kinect_id (int):
+ ID of a kinect, starts from 0.
+ frame_id (int, list or None, optional):
+ int: frame id of one selected frame
+ list: a list of frame id
+ None: all frames will be returned
+ Defaults to None.
+ disable_tqdm (bool, optional):
+ Whether to disable the entire progressbar wrapper.
+ Defaults to True.
+
+ Returns:
+ ndarray:
+ An ndarray in shape [frame_number, height, width, channels].
+ """
+ frames = []
+ if frame_id is None:
+ frame_list = range(self.get_kinect_num_frames())
+ elif isinstance(frame_id, list):
+ frame_list = frame_id
+ elif isinstance(frame_id, int):
+ assert frame_id < self.get_kinect_num_frames(),\
+ 'Index out of range...'
+ frame_list = [frame_id]
+ else:
+ raise TypeError('frame_id should be int, list or None.')
+ for i in tqdm.tqdm(frame_list, disable=disable_tqdm):
+ frames.append(
+ self.__read_color_from_bytes__(
+ self.smc['Kinect'][str(kinect_id)]['Color'][str(i)][()]))
+ return np.stack(frames, axis=0)
+
+ def get_kinect_rgbd(self,
+ kinect_id,
+ frame_id,
+ mode='color2depth',
+ threshold=0):
+ if mode == 'color2depth':
+ mapped_color = \
+ self.__map_color_to_depth__(
+ kinect_id, frame_id, threshold=threshold
+ )
+ depth = self.get_kinect_depth(kinect_id, frame_id)[0]
+ return mapped_color, depth
+ else:
+            print('Mode {} is not supported...'.format(mode))
+
+ def get_kinect_depth(self, kinect_id, frame_id=None, disable_tqdm=True):
+ """Get several frames captured by a kinect depth camera.
+
+ Args:
+ kinect_id (int):
+ ID of a kinect, starts from 0.
+ frame_id (int, list or None, optional):
+ int: frame id of one selected frame
+ list: a list of frame id
+ None: all frames will be returned
+ Defaults to None.
+ disable_tqdm (bool, optional):
+ Whether to disable the entire progressbar wrapper.
+ Defaults to True.
+
+ Returns:
+ ndarray:
+ An ndarray in shape [frame_number, height, width, channels].
+ """
+ frames = []
+        if frame_id is None:
+            frame_list = range(self.get_kinect_num_frames())
+        elif isinstance(frame_id, list):
+            frame_list = frame_id
+        elif isinstance(frame_id, int):
+            assert frame_id < self.get_kinect_num_frames(),\
+                'Index out of range...'
+            frame_list = [frame_id]
+        else:
+            raise TypeError('frame_id should be int, list or None.')
+ for i in tqdm.tqdm(frame_list, disable=disable_tqdm):
+ frames.append(
+ self.smc['Kinect'][str(kinect_id)]['Depth'][str(i)][()])
+ return np.stack(frames, axis=0)
+
+ def __read_color_from_bytes__(self, color_array):
+ """Decode an RGB image from an encoded byte array."""
+ return cv2.cvtColor(cv2.imdecode(color_array, cv2.IMREAD_COLOR),
+ cv2.COLOR_BGR2RGB)
+
+ def get_num_kinect(self):
+ """Get the number of Kinect devices.
+
+ Returns:
+ int:
+ Number of Kinect devices.
+ """
+ return self.num_kinects
+
+ def get_kinect_num_frames(self):
+ """Get the number of frames recorded by one Kinect RGB camera.
+
+ Returns:
+ int:
+ Number of frames.
+ """
+ return self.kinect_num_frames
+
+ def get_iphone_num_frames(self):
+ """Get the number of frames recorded by one iPhone RGB camera.
+
+ Returns:
+ int:
+ Number of frames.
+ """
+ return self.iphone_num_frames
+
+ def get_depth_mask(self, device_id, frame_id):
+ return self.smc['Kinect'][str(device_id)]['Mask'][str(frame_id)][()]
+
+ def get_kinect_mask(self, device_id, frame_id):
+ kinect_dict = self.smc['Kinect'][str(device_id)]
+ return kinect_dict['Mask_k4abt'][str(frame_id)][()]
+
+ def get_num_iphone(self):
+ """Get the number of iPhone devices.
+
+ Returns:
+ int:
+ Number of iPhone devices.
+ """
+ return self.num_iphones
+
+ def get_iphone_color(self,
+ iphone_id=0,
+ frame_id=None,
+ disable_tqdm=True,
+ vertical=True):
+ """Get several frames captured by an iPhone RGB camera.
+
+ Args:
+ iphone_id (int):
+ ID of an iPhone, starts from 0.
+ frame_id (int, list or None, optional):
+ int: frame id of one selected frame
+ list: a list of frame id
+ None: all frames will be returned
+ Defaults to None.
+ disable_tqdm (bool, optional):
+ Whether to disable the entire progressbar wrapper.
+ Defaults to True.
+ vertical (bool, optional):
+ iPhone assumes horizontal orientation
+ if True, convert data to vertical orientation
+ Defaults to True.
+
+ Returns:
+ frames:
+ An ndarray in shape [frame_number, height, width, channels].
+ """
+ frames = []
+ if frame_id is None:
+ frame_list = range(self.get_iphone_num_frames())
+ elif isinstance(frame_id, list):
+ frame_list = frame_id
+ elif isinstance(frame_id, int):
+ assert frame_id < self.get_iphone_num_frames(),\
+ 'Index out of range...'
+ frame_list = [frame_id]
+ else:
+ raise TypeError('frame_id should be int, list or None.')
+ for i in tqdm.tqdm(frame_list, disable=disable_tqdm):
+ frame = self.__read_color_from_bytes__(
+ self.smc['iPhone'][str(iphone_id)]['Color'][str(i)][()])
+ if vertical:
+ frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
+ frames.append(frame)
+ return np.stack(frames, axis=0)
+
+ def get_iphone_depth(self,
+ iphone_id=0,
+ frame_id=None,
+ disable_tqdm=True,
+ vertical=True):
+ """Get several frames captured by an iPhone RGB camera.
+
+ Args:
+ iphone_id (int):
+ ID of an iPhone, starts from 0.
+ frame_id (int, list or None, optional):
+ int: frame id of one selected frame
+ list: a list of frame id
+ None: all frames will be returned
+ Defaults to None.
+ disable_tqdm (bool, optional):
+ Whether to disable the entire progressbar wrapper.
+ Defaults to True.
+ vertical (bool, optional):
+ iPhone assumes horizontal orientation
+ if True, convert data to vertical orientation
+ Defaults to True.
+
+ Returns:
+ frames:
+ An ndarray in shape [frame_number, height, width, channels].
+ """
+ frames = []
+ if frame_id is None:
+ frame_list = range(self.get_iphone_num_frames())
+ elif isinstance(frame_id, list):
+ frame_list = frame_id
+ elif isinstance(frame_id, int):
+ assert frame_id < self.get_iphone_num_frames(),\
+ 'Index out of range...'
+ frame_list = [frame_id]
+ else:
+ raise TypeError('frame_id should be int, list or None.')
+ for i in tqdm.tqdm(frame_list, disable=disable_tqdm):
+ frame = self.smc['iPhone'][str(iphone_id)]['Depth'][str(i)][()]
+ if vertical:
+ frame = cv2.rotate(frame, cv2.ROTATE_90_CLOCKWISE)
+ frames.append(frame)
+ return np.stack(frames, axis=0)
+
+ def get_kinect_transformation_depth_to_color(self, device_id):
+ """Get transformation matrix from depth to color from a single kinect.
+
+ Args:
+            device_id (int):
+                ID of a Kinect, starts from 0.
+
+ Returns:
+ ndarray: A 4x4 transformation matrix.
+ """
+ return np.linalg.inv(self.get_kinect_color_extrinsics(
+ device_id)) @ self.get_kinect_depth_extrinsics(device_id)
+
+ def get_kinect_transformation_color_to_depth(self, device_id):
+ """Get transformation matrix from color to depth from a single kinect.
+
+ Args:
+            device_id (int):
+                ID of a Kinect, starts from 0.
+
+ Returns:
+ ndarray: A 4x4 transformation matrix.
+ """
+ return np.linalg.inv(self.get_kinect_depth_extrinsics(
+ device_id)) @ self.get_kinect_color_extrinsics(device_id)
+
+ def __map_color_to_depth__(self, device_id, frame_id, threshold=100):
+ color_image = self.get_kinect_color(device_id, frame_id)[0]
+ depth_image = self.get_kinect_depth(device_id, frame_id)[0]
+ color_intrinsic = self.get_kinect_color_intrinsics(device_id)
+ depth_intrinsic = self.get_kinect_depth_intrinsics(device_id)
+
+ mask = self.get_depth_mask(device_id, frame_id)
+
+ Td2c = self.get_kinect_transformation_depth_to_color(device_id)
+
+ colidx = np.arange(depth_image.shape[1])
+ rowidx = np.arange(depth_image.shape[0])
+ colidx_map, rowidx_map = np.meshgrid(colidx, rowidx)
+ col_indices = colidx_map[mask >= threshold]
+ row_indices = rowidx_map[mask >= threshold]
+
+ homo_padding = \
+ np.ones((col_indices.shape[0], 1), dtype=np.float32)
+ homo_indices = \
+ np.concatenate(
+ (col_indices[..., None], row_indices[..., None], homo_padding),
+ axis=1
+ )
+
+ depth_intrinsic_inv = np.linalg.inv(depth_intrinsic)
+ normalized_points = \
+ depth_intrinsic_inv[None, ...] @ homo_indices[..., None]
+
+ z_values = (depth_image / 1000)[mask >= threshold]
+ valid_points = \
+ normalized_points.squeeze() * z_values[..., None]
+
+ R = Td2c[:3, :3]
+ T = Td2c[:3, 3]
+ valid_points = \
+ R[None, ...] @ valid_points[..., None] + T[None, ..., None]
+ valid_uvs = \
+ color_intrinsic[None, ...] @\
+ valid_points / valid_points[:, 2][..., None]
+ valid_uvs = np.int32(valid_uvs.squeeze()[..., :2] + 0.5)
+ valid_uvs[:, 0] = np.clip(valid_uvs[:, 0], 0, color_image.shape[1] - 1)
+ valid_uvs[:, 1] = np.clip(valid_uvs[:, 1], 0, color_image.shape[0] - 1)
+ mapped_color = np.ones((depth_image.shape[0], depth_image.shape[1], 3),
+ dtype=np.uint8) * 255
+ mapped_color[mask >= threshold] = \
+ color_image[valid_uvs[:, 1], valid_uvs[:, 0]]
+
+ if threshold == 1:
+ return valid_uvs
+ return mapped_color
+
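+    # The mapping above implements, per depth pixel (u, v) with depth z in
+    # metres (after the /1000 scaling):
+    #   X_d = z * K_d^-1 @ [u, v, 1]^T      (back-project to the depth frame)
+    #   X_c = R_d2c @ X_d + T_d2c           (depth -> color extrinsics)
+    #   [u', v', 1]^T ~ K_c @ X_c / X_c.z   (project into the color image)
+    # where K_d, K_c are the depth/color intrinsics and [R_d2c | T_d2c] comes
+    # from get_kinect_transformation_depth_to_color().
+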
+ def get_kinect_skeleton_3d(self, device_id, frame_id):
+ """Get the 3D skeleton key points from a certain kinect.
+
+ Args:
+ device_id (int):
+ ID of a kinect, starts from 0.
+
+ Returns:
+ list:
+ A list with 3D keypoints
+ """
+ kinect_dict = self.smc['Kinect'][str(device_id)]
+ return json.loads(kinect_dict['Skeleton_k4abt'][str(frame_id)][()])
+
+ def get_depth_floor(self, device_id: int) -> dict:
+ """Get the floor plane defined by a normal vector and a center point
+ from a certain kinect.
+
+ Args:
+ device_id (int):
+ ID of a kinect, starts from 0.
+
+ Raises:
+ KeyError:
+                Key 'floor' not found for the given kinect.
+
+ Returns:
+ dict:
+ A dict with 'center', 'normal' and 'pnum'.
+ """
+ device_dict = self.calibration_dict[str(device_id * 2 + 1)]
+ if 'floor' in device_dict:
+ return device_dict['floor']
+ else:
+ raise KeyError(f'Kinect {device_id} has no floor data.')
+
+ def get_keypoints2d(self, device, device_id, frame_id=None, vertical=True):
+ """Get keypoints2d projected from keypoints3d.
+
+ Args:
+ device (str):
+ Device name, should be Kinect or iPhone.
+ device_id (int):
+ ID of a device, starts from 0.
+ frame_id (int, list or None, optional):
+ int: frame id of one selected frame
+ list: a list of frame id
+ None: all frames will be returned
+ Defaults to None.
+ vertical (bool, optional):
+ Only applicable to iPhone as device
+ iPhone assumes horizontal orientation
+ if True, convert data to vertical orientation
+ Defaults to True.
+
+ Returns:
+ Tuple[np.ndarray, np.ndarray]:
+ keypoints2d (N, J, 3) and its mask (J, )
+ """
+ assert device in {
+ 'Kinect', 'iPhone'
+ }, f'Undefined device: {device}, should be "Kinect" or "iPhone"'
+ assert device_id >= 0
+
+ kps2d_dict = self.smc['Keypoints2D'][device][str(device_id)]
+ keypoints2d = kps2d_dict['keypoints2d'][...]
+ keypoints2d_mask = kps2d_dict['keypoints2d_mask'][...]
+
+ if frame_id is None:
+ frame_list = range(self.get_keypoints_num_frames())
+ elif isinstance(frame_id, list):
+ frame_list = frame_id
+ elif isinstance(frame_id, int):
+ assert frame_id < self.get_keypoints_num_frames(),\
+ 'Index out of range...'
+ frame_list = [frame_id]
+ else:
+ raise TypeError('frame_id should be int, list or None.')
+
+ keypoints2d = keypoints2d[frame_list, ...]
+
+ if device == 'iPhone' and vertical:
+ # rotate keypoints 2D clockwise by 90 degrees
+ W, H = self.get_iphone_color_resolution(vertical=False)
+ xs, ys, conf = \
+ keypoints2d[..., 0], keypoints2d[..., 1], keypoints2d[..., 2]
+ xs, ys = H - ys, xs # horizontal -> vertical
+ keypoints2d[..., 0], keypoints2d[..., 1] = xs.copy(), ys.copy()
+ keypoints2d[conf == 0.0] = 0.0
+
+ return keypoints2d, keypoints2d_mask
+
+ def get_kinect_keypoints2d(self, device_id, frame_id=None):
+ """Get Kinect 2D keypoints.
+
+ Args:
+ device_id (int):
+ ID of Kinect, starts from 0.
+ frame_id (int, list or None, optional):
+ int: frame id of one selected frame
+ list: a list of frame id
+ None: all frames will be returned
+ Defaults to None.
+
+ Returns:
+ Tuple[np.ndarray, np.ndarray]:
+ keypoints2d (N, J, 3) and its mask (J, )
+ """
+ assert self.num_kinects > device_id >= 0
+ return self.get_keypoints2d('Kinect', device_id, frame_id)
+
+ def get_iphone_keypoints2d(self,
+ device_id=0,
+ frame_id=None,
+ vertical=True):
+ """Get iPhone 2D keypoints.
+
+ Args:
+ device_id (int):
+ ID of iPhone, starts from 0.
+ frame_id (int, list or None, optional):
+ int: frame id of one selected frame
+ list: a list of frame id
+ None: all frames will be returned
+ Defaults to None.
+ vertical (bool, optional):
+ iPhone assumes horizontal orientation
+ if True, convert data to vertical orientation
+ Defaults to True.
+
+ Returns:
+ Tuple[np.ndarray, np.ndarray]:
+ keypoints2d (N, J, 3) and its mask (J, )
+ """
+ assert device_id >= 0
+ return self.get_keypoints2d('iPhone',
+ device_id,
+ frame_id,
+ vertical=vertical)
+
+ def get_color(self,
+ device,
+ device_id,
+ frame_id=None,
+ disable_tqdm=True,
+ vertical=True):
+ """Get RGB image(s) from Kinect RGB or iPhone RGB camera.
+
+ Args:
+ device (str):
+ Device name, should be Kinect or iPhone.
+ device_id (int):
+ Device ID, starts from 0.
+ frame_id (int, list or None, optional):
+ int: frame id of one selected frame
+ list: a list of frame id
+ None: all frames will be returned
+ Defaults to None.
+ disable_tqdm (bool, optional):
+ Whether to disable the entire progressbar wrapper.
+ Defaults to True.
+ vertical (bool, optional):
+ Only applicable to iPhone as device
+ iPhone assumes horizontal orientation
+ if True, convert data to vertical orientation
+ Defaults to True.
+
+ Returns:
+ img (ndarray):
+ An ndarray in shape [frame_number, height, width, channels].
+ """
+
+ assert device in {
+ 'Kinect', 'iPhone'
+ }, f'Undefined device: {device}, should be "Kinect" or "iPhone"'
+
+ if device == 'Kinect':
+ img = self.get_kinect_color(device_id, frame_id, disable_tqdm)
+ else:
+ img = self.get_iphone_color(device_id,
+ frame_id,
+ disable_tqdm,
+ vertical=vertical)
+
+ return img
+
+ def get_keypoints_num_frames(self):
+ return self.keypoints_num_frames
+
+ def get_keypoints_convention(self):
+ return self.keypoints_convention
+
+ def get_keypoints_created_time(self):
+ return self.keypoints_created_time
+
+ def get_keypoints3d(self,
+ device=None,
+ device_id=None,
+ frame_id=None,
+ vertical=True):
+ """Get keypoints3d (world coordinate) computed by mocap processing
+ pipeline.
+
+ Args:
+ device (str):
+ Device name, should be Kinect or iPhone.
+ None: world coordinate
+ Defaults to None.
+ device_id (int):
+ ID of a device, starts from 0.
+ None: world coordinate
+ Defaults to None
+ frame_id (int, list or None, optional):
+ int: frame id of one selected frame
+ list: a list of frame id
+ None: all frames will be returned
+ Defaults to None.
+ vertical (bool, optional):
+ Only applicable to iPhone as device
+ iPhone assumes horizontal orientation
+ if True, convert data to vertical orientation
+ Defaults to True.
+
+ Returns:
+ Tuple[np.ndarray, np.ndarray]:
+ keypoints3d (N, J, 4) and its mask (J, )
+ """
+ assert (device is None and device_id is None) or \
+ (device is not None and device_id is not None), \
+ 'device and device_id should be both None or both not None.'
+ if device is not None:
+ assert device in {
+ 'Kinect', 'iPhone'
+ }, f'Undefined device: {device}, should be "Kinect" or "iPhone"'
+ if device_id is not None:
+ assert device_id >= 0
+
+ if frame_id is None:
+ frame_list = range(self.get_keypoints_num_frames())
+ elif isinstance(frame_id, list):
+ frame_list = frame_id
+ elif isinstance(frame_id, int):
+ assert frame_id < self.get_keypoints_num_frames(),\
+ 'Index out of range...'
+ frame_list = [frame_id]
+ else:
+ raise TypeError('frame_id should be int, list or None.')
+
+ kps3d_dict = self.smc['Keypoints3D']
+
+ # keypoints3d are in world coordinate system
+ keypoints3d_world = kps3d_dict['keypoints3d'][...]
+ keypoints3d_world = keypoints3d_world[frame_list, ...]
+ keypoints3d_mask = kps3d_dict['keypoints3d_mask'][...]
+
+ # return keypoints3d in world coordinate system
+ if device is None:
+ return keypoints3d_world, keypoints3d_mask
+
+ # return keypoints3d in device coordinate system
+ else:
+ if device == 'Kinect':
+ cam2world = self.get_kinect_color_extrinsics(
+ kinect_id=device_id, homogeneous=True)
+ else:
+ cam2world = self.get_iphone_extrinsics(iphone_id=device_id,
+ vertical=vertical)
+
+ xyz, conf = keypoints3d_world[..., :3], keypoints3d_world[..., [3]]
+ xyz_homogeneous = np.ones([*xyz.shape[:-1], 4])
+ xyz_homogeneous[..., :3] = xyz
+ world2cam = np.linalg.inv(cam2world)
+ keypoints3d = np.einsum('ij,kmj->kmi', world2cam, xyz_homogeneous)
+ keypoints3d = np.concatenate([keypoints3d[..., :3], conf], axis=-1)
+
+ return keypoints3d, keypoints3d_mask
+
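+    # The device-frame conversion above, per keypoint: with
+    # world2cam = inv(cam2world), a homogeneous point p_w = [x, y, z, 1]^T is
+    # mapped to p_c = world2cam @ p_w (the einsum applies this to every
+    # keypoint), and the original confidence column is re-attached afterwards.
+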
+ def get_smpl_num_frames(self):
+ return self.smpl_num_frames
+
+ def get_smpl_created_time(self):
+ return self.smpl_created_time
+
+ def get_smpl(self,
+ device=None,
+ device_id=None,
+ frame_id=None,
+ vertical=True):
+ """Get SMPL (world coordinate) computed by mocap processing pipeline.
+
+ Args:
+ device (str):
+ Device name, should be Kinect or iPhone.
+ None: world coordinate
+ Defaults to None.
+ device_id (int):
+ ID of a device, starts from 0.
+ None: world coordinate
+ Defaults to None
+ frame_id (int, list or None, optional):
+ int: frame id of one selected frame
+ list: a list of frame id
+ None: all frames will be returned
+ Defaults to None.
+ vertical (bool, optional):
+ Only applicable to iPhone as device
+ iPhone assumes horizontal orientation
+ if True, convert data to vertical orientation
+ Defaults to True.
+
+ Returns:
+ dict:
+ 'global_orient': np.ndarray of shape (N, 3)
+ 'body_pose': np.ndarray of shape (N, 69)
+ 'transl': np.ndarray of shape (N, 3)
+ 'betas': np.ndarray of shape (N, 10)
+ """
+ smpl_dict = self.smc['SMPL']
+ global_orient = smpl_dict['global_orient'][...]
+ body_pose = smpl_dict['body_pose'][...]
+ transl = smpl_dict['transl'][...]
+ betas = smpl_dict['betas'][...]
+
+ if frame_id is None:
+ frame_list = range(self.get_smpl_num_frames())
+ elif isinstance(frame_id, list):
+ frame_list = frame_id
+ elif isinstance(frame_id, int):
+            assert frame_id < self.get_smpl_num_frames(),\
+ 'Index out of range...'
+ frame_list = [frame_id]
+ else:
+ raise TypeError('frame_id should be int, list or None.')
+
+ body_pose = body_pose[frame_list, ...]
+ global_orient = global_orient[frame_list, ...]
+ transl = transl[frame_list, ...]
+
+ # return SMPL parameters in world coordinate system
+ if device is None:
+ smpl_dict = dict(global_orient=global_orient,
+ body_pose=body_pose,
+ transl=transl,
+ betas=betas)
+
+ return smpl_dict
+
+ # return SMPL parameters in device coordinate system
+ else:
+
+ if self.body_model is None:
+ self.body_model = \
+ build_body_model(self.default_body_model_config)
+ torch_device = self.body_model.global_orient.device
+
+ assert device in {
+ 'Kinect', 'iPhone'
+ }, f'Undefined device: {device}, should be "Kinect" or "iPhone"'
+ assert device_id >= 0
+
+ if device == 'Kinect':
+ T_cam2world = self.get_kinect_color_extrinsics(
+ kinect_id=device_id, homogeneous=True)
+ else:
+ T_cam2world = self.get_iphone_extrinsics(iphone_id=device_id,
+ vertical=vertical)
+
+ T_world2cam = np.linalg.inv(T_cam2world)
+
+ output = self.body_model(
+ global_orient=torch.tensor(global_orient, device=torch_device),
+ body_pose=torch.tensor(body_pose, device=torch_device),
+ transl=torch.tensor(transl, device=torch_device),
+ betas=torch.tensor(betas, device=torch_device))
+ joints = output['joints'].detach().cpu().numpy()
+ pelvis = joints[:, 0, :]
+
+ new_global_orient, new_transl = batch_transform_to_camera_frame(
+ global_orient=global_orient,
+ transl=transl,
+ pelvis=pelvis,
+ extrinsic=T_world2cam)
+
+ smpl_dict = dict(global_orient=new_global_orient,
+ body_pose=body_pose,
+ transl=new_transl,
+ betas=betas)
+
+ return smpl_dict
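+
+
+# Illustrative end-to-end usage of SMCReader (the file path and frame ids are
+# placeholders; it assumes the file contains the Keypoints3D and SMPL groups,
+# and get_smpl() additionally needs the SMPL model files referenced by
+# default_body_model_config):
+#
+#   reader = SMCReader('sample.smc')
+#   imgs = reader.get_kinect_color(kinect_id=0, frame_id=[0, 1, 2])
+#   kps3d, kps3d_mask = reader.get_keypoints3d(device='Kinect', device_id=0)
+#   smpl_params = reader.get_smpl(device='iPhone', device_id=0)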
diff --git a/detrsmpl/data/datasets/__init__.py b/detrsmpl/data/datasets/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f8c20e70bc4719a3fb537d6f35bb513eca8dfce
--- /dev/null
+++ b/detrsmpl/data/datasets/__init__.py
@@ -0,0 +1,21 @@
+from .adversarial_dataset import AdversarialDataset
+from .base_dataset import BaseDataset
+from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset
+from .dataset_wrappers import ConcatDataset, RepeatDataset
+from .human_hybrik_dataset import HybrIKHumanImageDataset
+from .human_image_dataset import HumanImageDataset
+from .human_image_smplx_dataset import HumanImageSMPLXDataset
+from .human_video_dataset import HumanVideoDataset
+from .mesh_dataset import MeshDataset
+from .mixed_dataset import MixedDataset
+from .multi_human_image_dataset import MultiHumanImageDataset
+from .pipelines import Compose
+from .samplers import DistributedSampler
+
+__all__ = [
+ 'BaseDataset', 'HumanImageDataset', 'HumanImageSMPLXDataset',
+ 'build_dataloader', 'build_dataset', 'Compose', 'DistributedSampler',
+ 'ConcatDataset', 'RepeatDataset', 'DATASETS', 'PIPELINES', 'MixedDataset',
+ 'AdversarialDataset', 'MeshDataset', 'HumanVideoDataset',
+ 'HybrIKHumanImageDataset', 'MultiHumanImageDataset'
+]
diff --git a/detrsmpl/data/datasets/adversarial_dataset.py b/detrsmpl/data/datasets/adversarial_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..6484d3e0b83326883cf985157087671180bfe036
--- /dev/null
+++ b/detrsmpl/data/datasets/adversarial_dataset.py
@@ -0,0 +1,40 @@
+import numpy as np
+from torch.utils.data import Dataset
+
+from .builder import DATASETS, build_dataset
+
+
+@DATASETS.register_module()
+class AdversarialDataset(Dataset):
+ """Mix Dataset for the adversarial training in 3D human mesh estimation
+ task.
+
+ The dataset combines data from two datasets and
+ return a dict containing data from two datasets.
+ Args:
+ train_dataset (:obj:`Dataset`): Dataset for 3D human mesh estimation.
+ adv_dataset (:obj:`Dataset`): Dataset for adversarial learning.
+ """
+ def __init__(self, train_dataset: Dataset, adv_dataset: Dataset):
+ super().__init__()
+ self.train_dataset = build_dataset(train_dataset)
+ self.adv_dataset = build_dataset(adv_dataset)
+ self.num_train_data = len(self.train_dataset)
+ self.num_adv_data = len(self.adv_dataset)
+
+ def __len__(self):
+ """Get the size of the dataset."""
+ return self.num_train_data
+
+ def __getitem__(self, idx: int):
+ """Given index, get the data from train dataset and randomly sample an
+ item from adversarial dataset.
+
+ Return a dict containing data from train and adversarial dataset.
+ """
+ data = self.train_dataset[idx]
+ adv_idx = np.random.randint(low=0, high=self.num_adv_data, dtype=int)
+ adv_data = self.adv_dataset[adv_idx]
+ for k, v in adv_data.items():
+ data['adv_' + k] = v
+ return data
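+
+
+# Illustrative output contract (the two configs are placeholders for anything
+# build_dataset accepts): each sample contains the train-dataset item plus the
+# keys of a randomly drawn adversarial item, each prefixed with 'adv_'.
+#
+#   dataset = AdversarialDataset(train_dataset=train_cfg, adv_dataset=adv_cfg)
+#   sample = dataset[0]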
diff --git a/detrsmpl/data/datasets/base_dataset.py b/detrsmpl/data/datasets/base_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..494b836db1095c34ac25004b317dc9427434d2a1
--- /dev/null
+++ b/detrsmpl/data/datasets/base_dataset.py
@@ -0,0 +1,71 @@
+import copy
+from abc import ABCMeta, abstractmethod
+from typing import Optional, Union
+
+from torch.utils.data import Dataset
+
+from .pipelines import Compose
+
+
+class BaseDataset(Dataset, metaclass=ABCMeta):
+ """Base dataset.
+
+ Args:
+ data_prefix (str): the prefix of data path.
+ pipeline (list): a list of dict, where each element represents
+            an operation defined in `detrsmpl.data.datasets.pipelines`.
+ ann_file (str | None, optional): the annotation file. When ann_file is
+ str, the subclass is expected to read from the ann_file. When
+ ann_file is None, the subclass is expected to read according
+ to data_prefix.
+        test_mode (bool): in train mode or test mode. Default: False.
+ dataset_name (str | None, optional): the name of dataset. It is used
+ to identify the type of evaluation metric. Default: None.
+ """
+ # metric
+ ALLOWED_METRICS = {
+ 'mpjpe', 'pa-mpjpe', 'pve', '3dpck', 'pa-3dpck', '3dauc', 'pa-3dauc',
+ '3DRMSE', 'pa-pve'
+ }
+
+ def __init__(self,
+ data_prefix: str,
+ pipeline: list,
+ ann_file: Optional[Union[str, None]] = None,
+ test_mode: Optional[bool] = False,
+ dataset_name: Optional[Union[str, None]] = None):
+ super(BaseDataset, self).__init__()
+
+ self.ann_file = ann_file
+ self.data_prefix = data_prefix
+ self.test_mode = test_mode
+ self.pipeline = Compose(pipeline)
+ if dataset_name is not None:
+ self.dataset_name = dataset_name
+
+ self.load_annotations()
+
+ @abstractmethod
+ def load_annotations(self):
+ """Load annotations from ``ann_file``"""
+ pass
+
+ def prepare_data(self, idx: int):
+ """"Prepare raw data for the f'{idx'}-th data."""
+ results = copy.deepcopy(self.data_infos[idx])
+ results['dataset_name'] = self.dataset_name
+ results['sample_idx'] = idx
+ return self.pipeline(results)
+
+ def __len__(self):
+ """Return the length of current dataset."""
+ return self.num_data
+
+ def __getitem__(self, idx: int):
+ """Prepare data for the ``idx``-th data.
+
+ As for video dataset, we can first parse raw data for each frame. Then
+ we combine annotations from all frames. This interface is used to
+ simplify the logic of video dataset and other special datasets.
+ """
+ return self.prepare_data(idx)
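+
+
+# Subclassing sketch (names are illustrative): a concrete dataset implements
+# load_annotations() and is expected to populate self.data_infos (the
+# per-sample dicts consumed by prepare_data) and self.num_data (used by
+# __len__), as e.g. HybrIKHumanImageDataset does.
+#
+#   class MyImageDataset(BaseDataset):
+#       def load_annotations(self):
+#           paths = []  # hypothetical list of image files
+#           self.data_infos = [dict(image_path=p) for p in paths]
+#           self.num_data = len(self.data_infos)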
diff --git a/detrsmpl/data/datasets/builder.py b/detrsmpl/data/datasets/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..27d3be684dc7d49ef1b3fa8d4c90ca84cf42d1c3
--- /dev/null
+++ b/detrsmpl/data/datasets/builder.py
@@ -0,0 +1,124 @@
+import platform
+import random
+from functools import partial
+from typing import Optional, Union
+
+import numpy as np
+from mmcv.parallel import collate
+from mmcv.runner import get_dist_info
+from mmcv.utils import Registry, build_from_cfg
+from torch.utils.data import DataLoader
+from torch.utils.data.dataset import Dataset
+
+from .samplers import DistributedSampler
+
+if platform.system() != 'Windows':
+ # https://github.com/pytorch/pytorch/issues/973
+ import resource
+ rlimit = resource.getrlimit(resource.RLIMIT_NOFILE)
+ base_soft_limit = rlimit[0]
+ hard_limit = rlimit[1]
+ soft_limit = min(max(4096, base_soft_limit), hard_limit)
+ resource.setrlimit(resource.RLIMIT_NOFILE, (soft_limit, hard_limit))
+
+DATASETS = Registry('dataset')
+PIPELINES = Registry('pipeline')
+
+
+def build_dataset(cfg: Union[dict, list, tuple],
+ default_args: Optional[Union[dict, None]] = None):
+ """"Build dataset by the given config."""
+ from .dataset_wrappers import (
+ ConcatDataset,
+ RepeatDataset,
+ )
+ if isinstance(cfg, (list, tuple)):
+ dataset = ConcatDataset([build_dataset(c, default_args) for c in cfg])
+ elif cfg['type'] == 'RepeatDataset':
+ dataset = RepeatDataset(build_dataset(cfg['dataset'], default_args),
+ cfg['times'])
+ else:
+ dataset = build_from_cfg(cfg, DATASETS, default_args)
+
+ return dataset
+
+
+def build_dataloader(dataset: Dataset,
+ samples_per_gpu: int,
+ workers_per_gpu: int,
+ num_gpus: Optional[int] = 1,
+ dist: Optional[bool] = True,
+ shuffle: Optional[bool] = True,
+ round_up: Optional[bool] = True,
+ seed: Optional[Union[int, None]] = None,
+ persistent_workers: Optional[bool] = True,
+ **kwargs):
+ """Build PyTorch DataLoader.
+
+ In distributed training, each GPU/process has a dataloader.
+ In non-distributed training, there is only one dataloader for all GPUs.
+
+ Args:
+ dataset (:obj:`Dataset`): A PyTorch dataset.
+ samples_per_gpu (int): Number of training samples on each GPU, i.e.,
+ batch size of each GPU.
+ workers_per_gpu (int): How many subprocesses to use for data loading
+ for each GPU.
+ num_gpus (int, optional): Number of GPUs. Only used in non-distributed
+ training.
+ dist (bool, optional): Distributed training/test or not. Default: True.
+ shuffle (bool, optional): Whether to shuffle the data at every epoch.
+ Default: True.
+ round_up (bool, optional): Whether to round up the length of dataset by
+ adding extra samples to make it evenly divisible. Default: True.
+        persistent_workers (bool): If True, the data loader will not shut
+            down the worker processes after a dataset has been consumed
+            once. This keeps the workers' Dataset instances alive.
+            The argument only takes effect in PyTorch>=1.7.0.
+            Default: True.
+ kwargs: any keyword argument to be used to initialize DataLoader
+
+ Returns:
+ DataLoader: A PyTorch dataloader.
+ """
+ rank, world_size = get_dist_info()
+ if dist:
+ sampler = DistributedSampler(dataset,
+ world_size,
+ rank,
+ shuffle=shuffle,
+ round_up=round_up)
+ shuffle = False
+ batch_size = samples_per_gpu
+ num_workers = workers_per_gpu
+ else:
+ sampler = None
+ batch_size = num_gpus * samples_per_gpu
+ num_workers = num_gpus * workers_per_gpu
+
+ init_fn = partial(
+ worker_init_fn, num_workers=num_workers, rank=rank,
+ seed=seed) if seed is not None else None
+
+ data_loader = DataLoader(dataset,
+ batch_size=batch_size,
+ sampler=sampler,
+ num_workers=num_workers,
+ collate_fn=partial(
+ collate, samples_per_gpu=samples_per_gpu),
+ pin_memory=False,
+ shuffle=shuffle,
+ worker_init_fn=init_fn,
+ persistent_workers=persistent_workers,
+ **kwargs)
+
+ return data_loader
+
+
+def worker_init_fn(worker_id: int, num_workers: int, rank: int, seed: int):
+ """Init random seed for each worker."""
+    # The seed of each worker equals
+    # num_workers * rank + worker_id + user_seed.
+ worker_seed = num_workers * rank + worker_id + seed
+ np.random.seed(worker_seed)
+ random.seed(worker_seed)
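+
+
+# Illustrative configs (the dataset type and its arguments are placeholders;
+# any dataset registered in DATASETS works the same way):
+#
+#   dataset_cfg = dict(type='HumanImageDataset',
+#                      data_prefix='data',
+#                      pipeline=[],
+#                      dataset_name='h36m',
+#                      ann_file='h36m_train.npz')
+#   dataset = build_dataset(dataset_cfg)
+#   loader = build_dataloader(dataset,
+#                             samples_per_gpu=32,
+#                             workers_per_gpu=2,
+#                             dist=False,
+#                             shuffle=True)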
diff --git a/detrsmpl/data/datasets/dataset_wrappers.py b/detrsmpl/data/datasets/dataset_wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4228b1d2520b3019f90c42f1af12f9abf642cf8
--- /dev/null
+++ b/detrsmpl/data/datasets/dataset_wrappers.py
@@ -0,0 +1,45 @@
+from torch.utils.data.dataset import ConcatDataset as _ConcatDataset
+from torch.utils.data.dataset import Dataset
+
+from .builder import DATASETS
+
+
+@DATASETS.register_module()
+class ConcatDataset(_ConcatDataset):
+ """A wrapper of concatenated dataset.
+
+ Same as :obj:`torch.utils.data.dataset.ConcatDataset`, but
+ add `get_cat_ids` function.
+
+ Args:
+ datasets (list[:obj:`Dataset`]): A list of datasets.
+ """
+ def __init__(self, datasets: list):
+ super(ConcatDataset, self).__init__(datasets)
+
+
+@DATASETS.register_module()
+class RepeatDataset(object):
+ """A wrapper of repeated dataset.
+
+ The length of repeated dataset will be `times` larger than the original
+ dataset. This is useful when the data loading time is long but the dataset
+ is small. Using RepeatDataset can reduce the data loading time between
+ epochs.
+
+ Args:
+ dataset (:obj:`Dataset`): The dataset to be repeated.
+ times (int): Repeat times.
+ """
+ def __init__(self, dataset: Dataset, times: int):
+ self.dataset = dataset
+ self.times = times
+ self.CLASSES = dataset.CLASSES
+
+ self._ori_len = len(self.dataset)
+
+ def __getitem__(self, idx: int):
+ return self.dataset[idx % self._ori_len]
+
+ def __len__(self):
+ return self.times * self._ori_len
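+
+
+# RepeatDataset is usually constructed through build_dataset (see builder.py);
+# an illustrative config wrapping another, placeholder dataset config:
+#
+#   cfg = dict(type='RepeatDataset',
+#              times=10,
+#              dataset=dict(type='HumanImageDataset', ...))
+#   dataset = build_dataset(cfg)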
diff --git a/detrsmpl/data/datasets/human_hybrik_dataset.py b/detrsmpl/data/datasets/human_hybrik_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2b4034cd7b065423fb407dd767408aa9902266a
--- /dev/null
+++ b/detrsmpl/data/datasets/human_hybrik_dataset.py
@@ -0,0 +1,452 @@
+import json
+import os
+import os.path
+from abc import ABCMeta
+from collections import OrderedDict
+from typing import List, Optional, Union
+
+import mmcv
+import numpy as np
+import torch
+
+from detrsmpl.core.conventions.keypoints_mapping import get_mapping
+from detrsmpl.core.evaluation import (
+ keypoint_3d_auc,
+ keypoint_3d_pck,
+ keypoint_mpjpe,
+ vertice_pve,
+)
+from detrsmpl.data.data_structures.human_data import HumanData
+from detrsmpl.models.body_models.builder import build_body_model
+from detrsmpl.utils.demo_utils import box2cs, xyxy2xywh
+from .base_dataset import BaseDataset
+from .builder import DATASETS
+
+
+@DATASETS.register_module()
+class HybrIKHumanImageDataset(BaseDataset, metaclass=ABCMeta):
+ """Dataset for HybrIK training. The dataset loads raw features and apply
+ specified transforms to return a dict containing the image tensors and
+ other information.
+
+ Args:
+
+ data_prefix (str): Path to a directory where preprocessed datasets are
+ held.
+ pipeline (list[dict | callable]): A sequence of data transforms.
+ dataset_name (str): accepted names include 'h36m', 'pw3d',
+ 'mpi_inf_3dhp', 'coco'
+ ann_file (str): Name of annotation file.
+ test_mode (bool): Store True when building test dataset.
+ Default: False.
+ """
+ # metric
+ ALLOWED_METRICS = {
+ 'mpjpe', 'pa-mpjpe', 'pve', '3dpck', 'pa-3dpck', '3dauc', 'pa-3dauc'
+ }
+
+ def __init__(self,
+ data_prefix: str,
+ pipeline: list,
+ dataset_name: str,
+ body_model: Optional[Union[dict, None]] = None,
+ ann_file: Optional[Union[str, None]] = None,
+ test_mode: Optional[bool] = False):
+ if dataset_name is not None:
+ self.dataset_name = dataset_name
+ self.test_mode = test_mode
+ super(HybrIKHumanImageDataset, self).__init__(data_prefix, pipeline,
+ ann_file, test_mode)
+ if body_model is not None:
+ self.body_model = build_body_model(body_model)
+ else:
+ self.body_model = None
+
+ def get_annotation_file(self):
+ """Obtain annotation file path from data prefix."""
+ ann_prefix = os.path.join(self.data_prefix, 'preprocessed_datasets')
+ self.ann_file = os.path.join(ann_prefix, self.ann_file)
+
+ @staticmethod
+ def get_3d_keypoints_vis(keypoints):
+ """Get 3d keypoints and visibility mask
+ Args:
+ keypoints (np.ndarray): 2d (NxKx3) or 3d (NxKx4) keypoints with
+ visibility. N refers to number of datapoints, K refers to number
+ of keypoints.
+
+ Returns:
+ joint_img (np.ndarray): (NxKx3) 3d keypoints
+ joint_vis (np.ndarray): (NxKx3) visibility mask for keypoints
+ """
+ keypoints, keypoints_vis = keypoints[:, :, :-1], keypoints[:, :, -1]
+ num_datapoints, num_keypoints, dim = keypoints.shape
+ joint_img = np.zeros((num_datapoints, num_keypoints, 3),
+ dtype=np.float32)
+ joint_vis = np.zeros((num_datapoints, num_keypoints, 3),
+ dtype=np.float32)
+ joint_img[:, :, :dim] = keypoints
+ joint_vis[:, :, :dim] = np.tile(np.expand_dims(keypoints_vis, axis=2),
+ (1, dim))
+ return joint_img, joint_vis
+
+ def load_annotations(self):
+ """Load annotations."""
+ self.get_annotation_file()
+ data = HumanData()
+ data.load(self.ann_file)
+
+ self.image_path = data['image_path']
+ self.num_data = len(self.image_path)
+
+ self.bbox_xyxy = data['bbox_xywh']
+ self.width = data['image_width']
+ self.height = data['image_height']
+ self.depth_factor = data['depth_factor']
+
+ try:
+ self.keypoints3d, self.keypoints3d_vis = self.get_3d_keypoints_vis(
+ data['keypoints2d'])
+ except KeyError:
+ self.keypoints3d, self.keypoints3d_vis = self.get_3d_keypoints_vis(
+ data['keypoints3d'])
+
+ try:
+ self.smpl = data['smpl']
+ if 'has_smpl' not in data.keys():
+ self.has_smpl = np.ones((self.num_data)).astype(np.float32)
+ else:
+ self.has_smpl = data['has_smpl'].astype(np.float32)
+ self.thetas = self.smpl['thetas'].astype(np.float32)
+ self.betas = self.smpl['betas'].astype(np.float32)
+
+ self.keypoints3d_relative, _ = self.get_3d_keypoints_vis(
+ data['keypoints3d_relative'])
+ self.keypoints3d17, self.keypoints3d17_vis = \
+ self.get_3d_keypoints_vis(data['keypoints3d17'])
+ self.keypoints3d17_relative, _ = self.get_3d_keypoints_vis(
+ data['keypoints3d17_relative'])
+
+ if self.test_mode:
+ self.keypoints3d_cam, _ = self.get_3d_keypoints_vis(
+ data['keypoints3d_cam'])
+ except KeyError:
+ self.has_smpl = np.zeros((self.num_data)).astype(np.float32)
+ if self.test_mode:
+ self.keypoints3d, self.keypoints3d_vis = \
+ self.get_3d_keypoints_vis(data['keypoints3d'])
+ self.keypoints3d_cam, _ = self.get_3d_keypoints_vis(
+ data['keypoints3d_cam'])
+
+ try:
+ self.intrinsic = data['cam_param']['intrinsic']
+ except KeyError:
+ self.intrinsic = np.zeros((self.num_data, 3, 3))
+
+ try:
+ self.target_twist = data['phi']
+ # self.target_twist_weight = np.ones_like((self.target_twist))
+ self.target_twist_weight = data['phi_weight']
+ except KeyError:
+ self.target_twist = np.zeros((self.num_data, 23, 2))
+ self.target_twist_weight = np.zeros_like((self.target_twist))
+
+ try:
+ self.root_cam = data['root_cam']
+ except KeyError:
+ self.root_cam = np.zeros((self.num_data, 3))
+
+ self.data_infos = []
+
+ for idx in range(self.num_data):
+ info = {}
+ info['ann_info'] = {}
+ info['img_prefix'] = None
+ info['image_path'] = os.path.join(self.data_prefix, 'datasets',
+ self.dataset_name,
+ self.image_path[idx])
+ bbox_xyxy = self.bbox_xyxy[idx]
+ info['bbox'] = bbox_xyxy[:4]
+ bbox_xywh = xyxy2xywh(bbox_xyxy)
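+            # convert the box into a (center, scale) crop with square aspect
+            # ratio, enlarged by a factor of 1.25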
+ center, scale = box2cs(bbox_xywh,
+ aspect_ratio=1.0,
+ bbox_scale_factor=1.25)
+
+ info['center'] = center
+ info['scale'] = scale
+ info['rotation'] = 0
+ info['ann_info']['dataset_name'] = self.dataset_name
+ info['ann_info']['height'] = self.height[idx]
+ info['ann_info']['width'] = self.width[idx]
+ info['depth_factor'] = float(self.depth_factor[idx])
+ info['has_smpl'] = int(self.has_smpl[idx])
+ info['joint_root'] = self.root_cam[idx].astype(np.float32)
+ info['intrinsic_param'] = self.intrinsic[idx].astype(np.float32)
+ info['target_twist'] = self.target_twist[idx].astype(
+ np.float32) # twist_phi
+ info['target_twist_weight'] = self.target_twist_weight[idx].astype(
+ np.float32)
+ info['keypoints3d'] = self.keypoints3d[idx]
+ info['keypoints3d_vis'] = self.keypoints3d_vis[idx]
+
+ if info['has_smpl']:
+ info['pose'] = self.thetas[idx]
+ info['beta'] = self.betas[idx].astype(np.float32)
+ info['keypoints3d_relative'] = self.keypoints3d_relative[idx]
+ info['keypoints3d17'] = self.keypoints3d17[idx]
+ info['keypoints3d17_vis'] = self.keypoints3d17_vis[idx]
+ info['keypoints3d17_relative'] = self.keypoints3d17_relative[
+ idx]
+
+ if self.test_mode:
+ info['joint_relative_17'] = self.keypoints3d17_relative[
+ idx].astype(np.float32)
+
+ else:
+ if self.test_mode:
+ info['joint_relative_17'] = self.keypoints3d_cam[
+ idx].astype(np.float32)
+
+ self.data_infos.append(info)
+
+ def evaluate(self,
+ outputs: list,
+ res_folder: str,
+ metric: Optional[Union[str, List[str]]] = 'pa-mpjpe',
+ **kwargs: dict):
+ """Evaluate 3D keypoint results.
+
+ Args:
+ outputs (list): results from model inference.
+ res_folder (str): path to store results.
+            metric (Optional[Union[str, List[str]]]):
+ the type of metric. Default: 'pa-mpjpe'
+ kwargs (dict): other arguments.
+ Returns:
+ dict:
+ A dict of all evaluation results.
+ """
+ metrics = metric if isinstance(metric, list) else [metric]
+ for metric in metrics:
+ if metric not in self.ALLOWED_METRICS:
+ raise ValueError(f'metric {metric} is not supported')
+
+ res_file = os.path.join(res_folder, 'result_keypoints.json')
+
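+        # index results by image id so that multi-gpu outputs are restored to
+        # the original dataset order before dumping and evaluation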
+ res_dict = {}
+ for out in outputs:
+ target_id = out['image_idx']
+ batch_size = len(out['xyz_17'])
+ for i in range(batch_size):
+ res_dict[int(target_id[i])] = dict(
+ keypoints=out['xyz_17'][i],
+ poses=out['smpl_pose'][i],
+ betas=out['smpl_beta'][i],
+ )
+
+ keypoints, poses, betas = [], [], []
+ for i in range(self.num_data):
+ keypoints.append(res_dict[i]['keypoints'])
+ poses.append(res_dict[i]['poses'])
+ betas.append(res_dict[i]['betas'])
+
+ res = dict(keypoints=keypoints, poses=poses, betas=betas)
+ mmcv.dump(res, res_file)
+
+ name_value_tuples = []
+ for _metric in metrics:
+ if _metric == 'mpjpe':
+ _nv_tuples = self._report_mpjpe(res)
+ elif _metric == 'pa-mpjpe':
+ _nv_tuples = self._report_mpjpe(res, metric='pa-mpjpe')
+ elif _metric == '3dpck':
+ _nv_tuples = self._report_3d_pck(res)
+ elif _metric == 'pa-3dpck':
+ _nv_tuples = self._report_3d_pck(res, metric='pa-3dpck')
+ elif _metric == '3dauc':
+ _nv_tuples = self._report_3d_auc(res)
+ elif _metric == 'pa-3dauc':
+ _nv_tuples = self._report_3d_auc(res, metric='pa-3dauc')
+ elif _metric == 'pve':
+ _nv_tuples = self._report_pve(res)
+ else:
+ raise NotImplementedError
+ name_value_tuples.extend(_nv_tuples)
+
+ name_value = OrderedDict(name_value_tuples)
+ return name_value
+
+ @staticmethod
+ def _write_keypoint_results(keypoints, res_file):
+ """Write results into a json file."""
+ with open(res_file, 'w') as f:
+ json.dump(keypoints, f, sort_keys=True, indent=4)
+
+ def _parse_result(self, res, mode='keypoint'):
+ """Parse results."""
+ gts = self.data_infos
+ if mode == 'vertice':
+ pred_pose = torch.FloatTensor(res['poses'])
+ pred_beta = torch.FloatTensor(res['betas'])
+ pred_output = self.body_model(
+ betas=pred_beta,
+ body_pose=pred_pose[:, 1:],
+ global_orient=pred_pose[:, 0].unsqueeze(1),
+ pose2rot=False)
+ pred_vertices = pred_output['vertices'].detach().cpu().numpy()
+
+ gt_pose = torch.FloatTensor([gt['pose']
+ for gt in gts]).view(-1, 72)
+ gt_beta = torch.FloatTensor([gt['beta'] for gt in gts])
+ gt_output = self.body_model(betas=gt_beta,
+ body_pose=gt_pose[:, 3:],
+ global_orient=gt_pose[:, :3])
+ gt_vertices = gt_output['vertices'].detach().cpu().numpy()
+ gt_mask = np.ones(gt_vertices.shape[:-1])
+ assert len(pred_vertices) == self.num_data
+
+ return pred_vertices * 1000., gt_vertices * 1000., gt_mask
+ elif mode == 'keypoint':
+ pred_keypoints3d = res['keypoints']
+ assert len(pred_keypoints3d) == self.num_data
+ # (B, 17, 3)
+ pred_keypoints3d = np.array(pred_keypoints3d)
+ factor, root_idx_17 = 1, 0
+
+ if self.dataset_name == 'mpi_inf_3dhp':
+ _, hp3d_idxs, _ = get_mapping('human_data',
+ 'mpi_inf_3dhp_test')
+ gt_keypoints3d = np.array(
+ [gt['joint_relative_17'][hp3d_idxs] for gt in gts])
+ joint_mapper = [
+ 14, 11, 12, 13, 8, 9, 10, 15, 1, 16, 0, 5, 6, 7, 2, 3, 4
+ ]
+ gt_keypoints3d_mask = np.ones(
+ (len(gt_keypoints3d), len(joint_mapper)))
+ else:
+ _, h36m_idxs, _ = get_mapping('human_data', 'h36m')
+ gt_keypoints3d = np.array(
+ [gt['joint_relative_17'][h36m_idxs] for gt in gts])
+ joint_mapper = [
+ 6, 5, 4, 1, 2, 3, 16, 15, 14, 11, 12, 13, 8, 10
+ ]
+ gt_keypoints3d_mask = np.ones(
+ (len(gt_keypoints3d), len(joint_mapper)))
+ if self.dataset_name == 'pw3d':
+ factor = 1000
+
+ assert len(pred_keypoints3d) == self.num_data
+
+ pred_keypoints3d = pred_keypoints3d * (2000 / factor)
+ if self.dataset_name == 'mpi_inf_3dhp':
+ gt_keypoints3d = gt_keypoints3d[:, joint_mapper, :]
+ # root joint alignment
+ pred_keypoints3d = (
+ pred_keypoints3d -
+ pred_keypoints3d[:, None, root_idx_17]) * factor
+ gt_keypoints3d = (gt_keypoints3d -
+ gt_keypoints3d[:, None, root_idx_17]) * factor
+
+ if self.dataset_name == 'pw3d' or self.dataset_name == 'h36m':
+ # select eval 14 joints
+ pred_keypoints3d = pred_keypoints3d[:, joint_mapper, :]
+ gt_keypoints3d = gt_keypoints3d[:, joint_mapper, :]
+
+ gt_keypoints3d_mask = gt_keypoints3d_mask > 0
+
+ return pred_keypoints3d, gt_keypoints3d, gt_keypoints3d_mask
+
+ else:
+ raise NotImplementedError()
+
+ def _report_mpjpe(self, res_file, metric='mpjpe'):
+ """Cauculate mean per joint position error (MPJPE) or its variants PA-
+ MPJPE.
+
+ Report mean per joint position error (MPJPE) and mean per joint
+ position error after rigid alignment (PA-MPJPE)
+ """
+ pred_keypoints3d, gt_keypoints3d, gt_keypoints3d_mask = \
+ self._parse_result(res_file, mode='keypoint')
+
+ err_name = metric.upper()
+ if metric == 'mpjpe':
+ alignment = 'none'
+ elif metric == 'pa-mpjpe':
+ alignment = 'procrustes'
+ else:
+ raise ValueError(f'Invalid metric: {metric}')
+
+ error = keypoint_mpjpe(pred_keypoints3d, gt_keypoints3d,
+ gt_keypoints3d_mask, alignment)
+ info_str = [(err_name, error)]
+
+ return info_str
+
+ def _report_3d_pck(self, res_file, metric='3dpck'):
+ """Cauculate Percentage of Correct Keypoints (3DPCK) w. or w/o
+ Procrustes alignment.
+ Args:
+ keypoint_results (list): Keypoint predictions. See
+ 'Body3DMpiInf3dhpDataset.evaluate' for details.
+ metric (str): Specify mpjpe variants. Supported options are:
+ - ``'3dpck'``: Standard 3DPCK.
+ - ``'pa-3dpck'``:
+ 3DPCK after aligning prediction to groundtruth
+ via a rigid transformation (scale, rotation and
+ translation).
+ """
+
+ pred_keypoints3d, gt_keypoints3d, gt_keypoints3d_mask = \
+ self._parse_result(res_file, mode='keypoint')
+
+ err_name = metric.upper()
+ if metric == '3dpck':
+ alignment = 'none'
+ elif metric == 'pa-3dpck':
+ alignment = 'procrustes'
+ else:
+ raise ValueError(f'Invalid metric: {metric}')
+
+ error = keypoint_3d_pck(pred_keypoints3d, gt_keypoints3d,
+ gt_keypoints3d_mask, alignment)
+ name_value_tuples = [(err_name, error)]
+
+ return name_value_tuples
+
+ def _report_3d_auc(self, res_file, metric='3dauc'):
+ """Cauculate the Area Under the Curve (AUC) computed for a range of
+ 3DPCK thresholds.
+ Args:
+ keypoint_results (list): Keypoint predictions. See
+ 'Body3DMpiInf3dhpDataset.evaluate' for details.
+ metric (str): Specify mpjpe variants. Supported options are:
+ - ``'3dauc'``: Standard 3DAUC.
+ - ``'pa-3dauc'``: 3DAUC after aligning prediction to
+ groundtruth via a rigid transformation (scale, rotation and
+ translation).
+ """
+
+ pred_keypoints3d, gt_keypoints3d, gt_keypoints3d_mask = \
+ self._parse_result(res_file, mode='keypoint')
+
+ err_name = metric.upper()
+ if metric == '3dauc':
+ alignment = 'none'
+ elif metric == 'pa-3dauc':
+ alignment = 'procrustes'
+ else:
+ raise ValueError(f'Invalid metric: {metric}')
+
+ error = keypoint_3d_auc(pred_keypoints3d, gt_keypoints3d,
+ gt_keypoints3d_mask, alignment)
+ name_value_tuples = [(err_name, error)]
+
+ return name_value_tuples
+
+ def _report_pve(self, res_file):
+ """Cauculate per vertex error."""
+ pred_verts, gt_verts, _ = \
+ self._parse_result(res_file, mode='vertice')
+ error = vertice_pve(pred_verts, gt_verts)
+ return [('PVE', error)]
diff --git a/detrsmpl/data/datasets/human_image_dataset.py b/detrsmpl/data/datasets/human_image_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..89b6c121892d889ddb3ff3b3aece38c150ef22ef
--- /dev/null
+++ b/detrsmpl/data/datasets/human_image_dataset.py
@@ -0,0 +1,662 @@
+import json
+import os
+import os.path
+from abc import ABCMeta
+from collections import OrderedDict
+from typing import Any, List, Optional, Union
+
+import mmcv
+import numpy as np
+import torch
+import torch.distributed as dist
+from mmcv.runner import get_dist_info
+
+from detrsmpl.core.conventions.keypoints_mapping import (
+ convert_kps,
+ get_keypoint_num,
+ get_mapping,
+)
+from detrsmpl.core.evaluation import (
+ keypoint_3d_auc,
+ keypoint_3d_pck,
+ keypoint_mpjpe,
+ vertice_pve,
+)
+from detrsmpl.data.data_structures.human_data import HumanData
+from detrsmpl.data.data_structures.human_data_cache import (
+ HumanDataCacheReader,
+ HumanDataCacheWriter,
+)
+from detrsmpl.models.body_models.builder import build_body_model
+from .base_dataset import BaseDataset
+from .builder import DATASETS
+
+
+@DATASETS.register_module()
+class HumanImageDataset(BaseDataset, metaclass=ABCMeta):
+ """Human Image Dataset.
+
+ Args:
+ data_prefix (str): the prefix of data path.
+        pipeline (list): a list of dicts, where each element represents
+            an operation defined in `detrsmpl.datasets.pipelines`.
+ dataset_name (str | None): the name of dataset. It is used to
+ identify the type of evaluation metric. Default: None.
+ body_model (dict | None, optional): the config for body model,
+ which will be used to generate meshes and keypoints.
+ Default: None.
+ ann_file (str | None, optional): the annotation file. When ann_file
+ is str, the subclass is expected to read from the ann_file.
+ When ann_file is None, the subclass is expected to read
+ according to data_prefix.
+ convention (str, optional): keypoints convention. Keypoints will be
+ converted from "human_data" to the given one.
+ Default: "human_data"
+ cache_data_path (str | None, optional): the path to store the cache
+ file. When cache_data_path is None, each dataset will store a copy
+ into memory. If cache_data_path is set, the dataset will first
+ create one cache file and then use a cache reader to reduce memory
+            cost and initialization time. The cache file will be generated
+            only once if it is not found at the path. Otherwise, only a
+            cache reader will be established.
+ test_mode (bool, optional): in train mode or test mode.
+ Default: False.
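+
+    Example:
+        >>> # illustrative config; the paths and file names below are
+        >>> # placeholders, not files shipped with this repository
+        >>> cfg = dict(
+        ...     type='HumanImageDataset',
+        ...     data_prefix='data',
+        ...     pipeline=[],
+        ...     dataset_name='h36m',
+        ...     ann_file='h36m_train.npz',
+        ...     convention='human_data')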
+ """
+ # metric
+ ALLOWED_METRICS = {
+ 'mpjpe', 'pa-mpjpe', 'pve', '3dpck', 'pa-3dpck', '3dauc', 'pa-3dauc',
+ 'ihmr'
+ }
+
+ def __init__(self,
+ data_prefix: str,
+ pipeline: list,
+ dataset_name: str,
+ body_model: Optional[Union[dict, None]] = None,
+ ann_file: Optional[Union[str, None]] = None,
+ convention: Optional[str] = 'human_data',
+ cache_data_path: Optional[Union[str, None]] = None,
+ test_mode: Optional[bool] = False):
+ self.convention = convention
+ self.num_keypoints = get_keypoint_num(convention)
+ self.cache_data_path = cache_data_path
+ super(HumanImageDataset,
+ self).__init__(data_prefix, pipeline, ann_file, test_mode,
+ dataset_name)
+ if body_model is not None:
+ self.body_model = build_body_model(body_model)
+ else:
+ self.body_model = None
+
+ def get_annotation_file(self):
+ """Get path of the annotation file."""
+ ann_prefix = os.path.join(self.data_prefix, 'preprocessed_datasets')
+ self.ann_file = os.path.join(ann_prefix, self.ann_file)
+
+ def load_annotations(self):
+ """Load annotation from the annotation file.
+
+ Here we simply use :obj:`HumanData` to parse the annotation.
+ """
+ rank, world_size = get_dist_info()
+ self.get_annotation_file()
+ if self.cache_data_path is None:
+ use_human_data = True
+ elif rank == 0 and not os.path.exists(self.cache_data_path):
+ use_human_data = True
+ else:
+ use_human_data = False
+ if use_human_data:
+ self.human_data = HumanData.fromfile(self.ann_file)
+
+ if self.human_data.check_keypoints_compressed():
+ self.human_data.decompress_keypoints()
+ # change keypoint from 'human_data' to the given convention
+ if 'keypoints3d' in self.human_data:
+ keypoints3d = self.human_data['keypoints3d']
+ assert 'keypoints3d_mask' in self.human_data
+ keypoints3d_mask = self.human_data['keypoints3d_mask']
+ keypoints3d, keypoints3d_mask = \
+ convert_kps(
+ keypoints3d,
+ src='human_data',
+ dst=self.convention,
+ mask=keypoints3d_mask)
+ self.human_data.__setitem__('keypoints3d', keypoints3d)
+ self.human_data.__setitem__('keypoints3d_convention',
+ self.convention)
+ self.human_data.__setitem__('keypoints3d_mask',
+ keypoints3d_mask)
+ if 'keypoints2d' in self.human_data:
+ keypoints2d = self.human_data['keypoints2d']
+ assert 'keypoints2d_mask' in self.human_data
+ keypoints2d_mask = self.human_data['keypoints2d_mask']
+ keypoints2d, keypoints2d_mask = \
+ convert_kps(
+ keypoints2d,
+ src='human_data',
+ dst=self.convention,
+ mask=keypoints2d_mask)
+ self.human_data.__setitem__('keypoints2d', keypoints2d)
+ self.human_data.__setitem__('keypoints2d_convention',
+ self.convention)
+ self.human_data.__setitem__('keypoints2d_mask',
+ keypoints2d_mask)
+ self.human_data.compress_keypoints_by_mask()
+
+ if self.cache_data_path is not None:
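+            # only rank 0 writes the sliced cache file; the other ranks wait
+            # at the barrier and then read it through a lightweight reader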
+ if rank == 0 and not os.path.exists(self.cache_data_path):
+ writer_kwargs, sliced_data = self.human_data.get_sliced_cache()
+ writer = HumanDataCacheWriter(**writer_kwargs)
+ writer.update_sliced_dict(sliced_data)
+ writer.dump(self.cache_data_path)
+ if world_size > 1:
+ dist.barrier()
+ self.cache_reader = HumanDataCacheReader(
+ npz_path=self.cache_data_path)
+ self.num_data = self.cache_reader.data_len
+ self.human_data = None
+ else:
+ self.cache_reader = None
+ self.num_data = self.human_data.data_len
+
+ def prepare_raw_data(self, idx: int):
+ """Get item from self.human_data."""
+ sample_idx = idx
+ if self.cache_reader is not None:
+ self.human_data = self.cache_reader.get_item(idx)
+ idx = idx % self.cache_reader.slice_size
+ info = {}
+ info['img_prefix'] = None
+ image_path = self.human_data['image_path'][idx]
+ info['image_path'] = os.path.join(self.data_prefix, 'datasets',
+ self.dataset_name, image_path)
+ if image_path.endswith('smc'):
+ device, device_id, frame_id = self.human_data['image_id'][idx]
+ info['image_id'] = (device, int(device_id), int(frame_id))
+
+ info['dataset_name'] = self.dataset_name
+ info['sample_idx'] = sample_idx
+ if 'bbox_xywh' in self.human_data:
+ info['bbox_xywh'] = self.human_data['bbox_xywh'][idx]
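+            # build a square crop around the box: keep its center and use the
+            # longer side for both scale dimensions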
+ x, y, w, h, s = info['bbox_xywh']
+ cx = x + w / 2
+ cy = y + h / 2
+ w = h = max(w, h)
+ info['center'] = np.array([cx, cy])
+ info['scale'] = np.array([w, h])
+ else:
+ info['bbox_xywh'] = np.zeros((5))
+ info['center'] = np.zeros((2))
+ info['scale'] = np.zeros((2))
+
+ # in later modules, we will check validity of each keypoint by
+ # its confidence. Therefore, we do not need the mask of keypoints.
+
+ if 'keypoints2d' in self.human_data:
+ info['keypoints2d'] = self.human_data['keypoints2d'][idx]
+ info['has_keypoints2d'] = 1
+ else:
+ info['keypoints2d'] = np.zeros((self.num_keypoints, 3))
+ info['has_keypoints2d'] = 0
+ if 'keypoints3d' in self.human_data:
+ info['keypoints3d'] = self.human_data['keypoints3d'][idx]
+ info['has_keypoints3d'] = 1
+ else:
+ info['keypoints3d'] = np.zeros((self.num_keypoints, 4))
+ info['has_keypoints3d'] = 0
+
+ if 'smpl' in self.human_data:
+ smpl_dict = self.human_data['smpl']
+ else:
+ smpl_dict = {}
+
+ if 'smpl' in self.human_data:
+ if 'has_smpl' in self.human_data:
+ info['has_smpl'] = int(self.human_data['has_smpl'][idx])
+ else:
+ info['has_smpl'] = 1
+ else:
+ info['has_smpl'] = 0
+ if 'body_pose' in smpl_dict:
+ info['smpl_body_pose'] = smpl_dict['body_pose'][idx]
+ else:
+ info['smpl_body_pose'] = np.zeros((23, 3))
+
+ if 'global_orient' in smpl_dict:
+ info['smpl_global_orient'] = smpl_dict['global_orient'][idx]
+ else:
+ info['smpl_global_orient'] = np.zeros((3))
+
+ if 'betas' in smpl_dict:
+ info['smpl_betas'] = smpl_dict['betas'][idx]
+ else:
+ info['smpl_betas'] = np.zeros((10))
+
+ if 'transl' in smpl_dict:
+ info['smpl_transl'] = smpl_dict['transl'][idx]
+ else:
+ info['smpl_transl'] = np.zeros((3))
+
+ return info
+
+ def prepare_data(self, idx: int):
+ """Generate and transform data."""
+ info = self.prepare_raw_data(idx)
+ return self.pipeline(info)
+
+ def evaluate(self,
+ outputs: list,
+ res_folder: str,
+ metric: Optional[Union[str, List[str]]] = 'pa-mpjpe',
+ **kwargs: dict):
+ """Evaluate 3D keypoint results.
+
+ Args:
+ outputs (list): results from model inference.
+ res_folder (str): path to store results.
+            metric (Optional[Union[str, List[str]]]):
+ the type of metric. Default: 'pa-mpjpe'
+ kwargs (dict): other arguments.
+ Returns:
+ dict:
+ A dict of all evaluation results.
+ """
+ metrics = metric if isinstance(metric, list) else [metric]
+ for metric in metrics:
+ if metric not in self.ALLOWED_METRICS:
+ raise KeyError(f'metric {metric} is not supported')
+
+ res_file = os.path.join(res_folder, 'result_keypoints.json')
+ # for keeping correctness during multi-gpu test, we sort all results
+
+ res_dict = {}
+ for out in outputs:
+ target_id = out['image_idx']
+ batch_size = len(out['keypoints_3d'])
+ for i in range(batch_size):
+ res_dict[int(target_id[i])] = dict(
+ keypoints=out['keypoints_3d'][i],
+ poses=out['smpl_pose'][i],
+ betas=out['smpl_beta'][i],
+ )
+
+ keypoints, poses, betas = [], [], []
+ for i in range(self.num_data):
+ keypoints.append(res_dict[i]['keypoints'])
+ poses.append(res_dict[i]['poses'])
+ betas.append(res_dict[i]['betas'])
+
+ res = dict(keypoints=keypoints, poses=poses, betas=betas)
+ mmcv.dump(res, res_file)
+
+ name_value_tuples = []
+ for _metric in metrics:
+ if _metric == 'mpjpe':
+ _nv_tuples = self._report_mpjpe(res)
+ elif _metric == 'pa-mpjpe':
+ _nv_tuples = self._report_mpjpe(res, metric='pa-mpjpe')
+ elif _metric == '3dpck':
+ _nv_tuples = self._report_3d_pck(res)
+ elif _metric == 'pa-3dpck':
+ _nv_tuples = self._report_3d_pck(res, metric='pa-3dpck')
+ elif _metric == '3dauc':
+ _nv_tuples = self._report_3d_auc(res)
+ elif _metric == 'pa-3dauc':
+ _nv_tuples = self._report_3d_auc(res, metric='pa-3dauc')
+ elif _metric == 'pve':
+ _nv_tuples = self._report_pve(res)
+ elif _metric == 'ihmr':
+ _nv_tuples = self._report_ihmr(res)
+ else:
+ raise NotImplementedError
+ name_value_tuples.extend(_nv_tuples)
+
+ name_value = OrderedDict(name_value_tuples)
+ return name_value
+
+ @staticmethod
+ def _write_keypoint_results(keypoints: Any, res_file: str):
+ """Write results into a json file."""
+
+ with open(res_file, 'w') as f:
+ json.dump(keypoints, f, sort_keys=True, indent=4)
+
+ def _parse_result(self, res, mode='keypoint', body_part=None):
+ """Parse results."""
+
+ if mode == 'vertice':
+ # gt
+ gt_beta, gt_pose, gt_global_orient, gender = [], [], [], []
+ gt_smpl_dict = self.human_data['smpl']
+ for idx in range(self.num_data):
+ gt_beta.append(gt_smpl_dict['betas'][idx])
+ gt_pose.append(gt_smpl_dict['body_pose'][idx])
+ gt_global_orient.append(gt_smpl_dict['global_orient'][idx])
+ if self.human_data['meta']['gender'][idx] == 'm':
+ gender.append(0)
+ else:
+ gender.append(1)
+ gt_beta = torch.FloatTensor(gt_beta)
+ gt_pose = torch.FloatTensor(gt_pose).view(-1, 69)
+ gt_global_orient = torch.FloatTensor(gt_global_orient)
+ gender = torch.Tensor(gender)
+ gt_output = self.body_model(betas=gt_beta,
+ body_pose=gt_pose,
+ global_orient=gt_global_orient,
+ gender=gender)
+ gt_vertices = gt_output['vertices'].detach().cpu().numpy() * 1000.
+ gt_mask = np.ones(gt_vertices.shape[:-1])
+ # pred
+ pred_pose = torch.FloatTensor(res['poses'])
+ pred_beta = torch.FloatTensor(res['betas'])
+ pred_output = self.body_model(
+ betas=pred_beta,
+ body_pose=pred_pose[:, 1:],
+ global_orient=pred_pose[:, 0].unsqueeze(1),
+ pose2rot=False,
+ gender=gender)
+ pred_vertices = pred_output['vertices'].detach().cpu().numpy(
+ ) * 1000.
+
+ assert len(pred_vertices) == self.num_data
+
+ return pred_vertices, gt_vertices, gt_mask
+ elif mode == 'keypoint':
+ pred_keypoints3d = res['keypoints']
+ assert len(pred_keypoints3d) == self.num_data
+ # (B, 17, 3)
+ pred_keypoints3d = np.array(pred_keypoints3d)
+
+ if self.dataset_name == 'pw3d':
+ betas = []
+ body_pose = []
+ global_orient = []
+ gender = []
+ smpl_dict = self.human_data['smpl']
+ for idx in range(self.num_data):
+ betas.append(smpl_dict['betas'][idx])
+ body_pose.append(smpl_dict['body_pose'][idx])
+ global_orient.append(smpl_dict['global_orient'][idx])
+ if self.human_data['meta']['gender'][idx] == 'm':
+ gender.append(0)
+ else:
+ gender.append(1)
+ betas = torch.FloatTensor(betas)
+ body_pose = torch.FloatTensor(body_pose).view(-1, 69)
+ global_orient = torch.FloatTensor(global_orient)
+ gender = torch.Tensor(gender)
+ gt_output = self.body_model(betas=betas,
+ body_pose=body_pose,
+ global_orient=global_orient,
+ gender=gender)
+ gt_keypoints3d = gt_output['joints'].detach().cpu().numpy()
+ gt_keypoints3d_mask = np.ones((len(pred_keypoints3d), 24))
+ elif self.dataset_name == 'h36m':
+ _, h36m_idxs, _ = get_mapping('human_data', 'h36m')
+ gt_keypoints3d = \
+ self.human_data['keypoints3d'][:, h36m_idxs, :3]
+ gt_keypoints3d_mask = np.ones((len(pred_keypoints3d), 17))
+ elif self.dataset_name == 'humman':
+ betas = []
+ body_pose = []
+ global_orient = []
+ smpl_dict = self.human_data['smpl']
+ for idx in range(self.num_data):
+ betas.append(smpl_dict['betas'][idx])
+ body_pose.append(smpl_dict['body_pose'][idx])
+ global_orient.append(smpl_dict['global_orient'][idx])
+ betas = torch.FloatTensor(betas)
+ body_pose = torch.FloatTensor(body_pose).view(-1, 69)
+ global_orient = torch.FloatTensor(global_orient)
+ gt_output = self.body_model(betas=betas,
+ body_pose=body_pose,
+ global_orient=global_orient)
+ gt_keypoints3d = gt_output['joints'].detach().cpu().numpy()
+ gt_keypoints3d_mask = np.ones((len(pred_keypoints3d), 24))
+ else:
+ raise NotImplementedError()
+
+ # SMPL_49 only!
+ if gt_keypoints3d.shape[1] == 49:
+ assert pred_keypoints3d.shape[1] == 49
+
+ gt_keypoints3d = gt_keypoints3d[:, 25:, :]
+ pred_keypoints3d = pred_keypoints3d[:, 25:, :]
+
+ joint_mapper = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18]
+ gt_keypoints3d = gt_keypoints3d[:, joint_mapper, :]
+ pred_keypoints3d = pred_keypoints3d[:, joint_mapper, :]
+
+ # we only evaluate on 14 lsp joints
+ pred_pelvis = (pred_keypoints3d[:, 2] +
+ pred_keypoints3d[:, 3]) / 2
+ gt_pelvis = (gt_keypoints3d[:, 2] + gt_keypoints3d[:, 3]) / 2
+
+ # H36M for testing!
+ elif gt_keypoints3d.shape[1] == 17:
+ assert pred_keypoints3d.shape[1] == 17
+
+ H36M_TO_J17 = [
+ 6, 5, 4, 1, 2, 3, 16, 15, 14, 11, 12, 13, 8, 10, 0, 7, 9
+ ]
+ H36M_TO_J14 = H36M_TO_J17[:14]
+ joint_mapper = H36M_TO_J14
+
+ pred_pelvis = pred_keypoints3d[:, 0]
+ gt_pelvis = gt_keypoints3d[:, 0]
+
+ gt_keypoints3d = gt_keypoints3d[:, joint_mapper, :]
+ pred_keypoints3d = pred_keypoints3d[:, joint_mapper, :]
+
+ # keypoint 24
+ elif gt_keypoints3d.shape[1] == 24:
+ assert pred_keypoints3d.shape[1] == 24
+
+ joint_mapper = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18]
+ gt_keypoints3d = gt_keypoints3d[:, joint_mapper, :]
+ pred_keypoints3d = pred_keypoints3d[:, joint_mapper, :]
+
+ # we only evaluate on 14 lsp joints
+ pred_pelvis = (pred_keypoints3d[:, 2] +
+ pred_keypoints3d[:, 3]) / 2
+ gt_pelvis = (gt_keypoints3d[:, 2] + gt_keypoints3d[:, 3]) / 2
+
+ else:
+ pass
+
+ pred_keypoints3d = (pred_keypoints3d -
+ pred_pelvis[:, None, :]) * 1000
+ gt_keypoints3d = (gt_keypoints3d - gt_pelvis[:, None, :]) * 1000
+
+ gt_keypoints3d_mask = gt_keypoints3d_mask[:, joint_mapper] > 0
+
+ return pred_keypoints3d, gt_keypoints3d, gt_keypoints3d_mask
+
+ def _report_mpjpe(self, res_file, metric='mpjpe', body_part=''):
+ """Cauculate mean per joint position error (MPJPE) or its variants PA-
+ MPJPE.
+
+ Report mean per joint position error (MPJPE) and mean per joint
+ position error after rigid alignment (PA-MPJPE)
+ """
+ pred_keypoints3d, gt_keypoints3d, gt_keypoints3d_mask = \
+ self._parse_result(res_file, mode='keypoint', body_part=body_part)
+
+ err_name = metric.upper()
+ if body_part != '':
+ err_name = body_part.upper() + ' ' + err_name
+
+ if metric == 'mpjpe':
+ alignment = 'none'
+ elif metric == 'pa-mpjpe':
+ alignment = 'procrustes'
+ else:
+ raise ValueError(f'Invalid metric: {metric}')
+
+ error = keypoint_mpjpe(pred_keypoints3d, gt_keypoints3d,
+ gt_keypoints3d_mask, alignment)
+ info_str = [(err_name, error)]
+
+ return info_str
+
+ def _report_3d_pck(self, res_file, metric='3dpck'):
+ """Cauculate Percentage of Correct Keypoints (3DPCK) w. or w/o
+ Procrustes alignment.
+ Args:
+ keypoint_results (list): Keypoint predictions. See
+ 'Body3DMpiInf3dhpDataset.evaluate' for details.
+ metric (str): Specify mpjpe variants. Supported options are:
+ - ``'3dpck'``: Standard 3DPCK.
+ - ``'pa-3dpck'``:
+ 3DPCK after aligning prediction to groundtruth
+ via a rigid transformation (scale, rotation and
+ translation).
+ """
+
+ pred_keypoints3d, gt_keypoints3d, gt_keypoints3d_mask = \
+ self._parse_result(res_file)
+
+ err_name = metric.upper()
+ if metric == '3dpck':
+ alignment = 'none'
+ elif metric == 'pa-3dpck':
+ alignment = 'procrustes'
+ else:
+ raise ValueError(f'Invalid metric: {metric}')
+
+ error = keypoint_3d_pck(pred_keypoints3d, gt_keypoints3d,
+ gt_keypoints3d_mask, alignment)
+ name_value_tuples = [(err_name, error)]
+
+ return name_value_tuples
+
+ def _report_3d_auc(self, res_file, metric='3dauc'):
+ """Cauculate the Area Under the Curve (AUC) computed for a range of
+ 3DPCK thresholds.
+ Args:
+ keypoint_results (list): Keypoint predictions. See
+ 'Body3DMpiInf3dhpDataset.evaluate' for details.
+ metric (str): Specify mpjpe variants. Supported options are:
+ - ``'3dauc'``: Standard 3DAUC.
+ - ``'pa-3dauc'``: 3DAUC after aligning prediction to
+ groundtruth via a rigid transformation (scale, rotation and
+ translation).
+ """
+
+ pred_keypoints3d, gt_keypoints3d, gt_keypoints3d_mask = \
+ self._parse_result(res_file)
+
+ err_name = metric.upper()
+ if metric == '3dauc':
+ alignment = 'none'
+ elif metric == 'pa-3dauc':
+ alignment = 'procrustes'
+ else:
+ raise ValueError(f'Invalid metric: {metric}')
+
+ error = keypoint_3d_auc(pred_keypoints3d, gt_keypoints3d,
+ gt_keypoints3d_mask, alignment)
+ name_value_tuples = [(err_name, error)]
+
+ return name_value_tuples
+
+ def _report_pve(self, res_file, metric='pve', body_part=''):
+ """Cauculate per vertex error."""
+ pred_verts, gt_verts, _ = \
+ self._parse_result(res_file, mode='vertice', body_part=body_part)
+ err_name = metric.upper()
+ if body_part != '':
+ err_name = body_part.upper() + ' ' + err_name
+
+ if metric == 'pve':
+ alignment = 'none'
+ elif metric == 'pa-pve':
+ alignment = 'procrustes'
+ else:
+ raise ValueError(f'Invalid metric: {metric}')
+ error = vertice_pve(pred_verts, gt_verts, alignment)
+ return [(err_name, error)]
+
+ def _report_ihmr(self, res_file):
+ """Calculate IHMR metric.
+
+ https://arxiv.org/abs/2203.16427
+ """
+ pred_keypoints3d, gt_keypoints3d, gt_keypoints3d_mask = \
+ self._parse_result(res_file, mode='keypoint')
+
+ pred_verts, gt_verts, _ = \
+ self._parse_result(res_file, mode='vertice')
+
+ from detrsmpl.utils.geometry import rot6d_to_rotmat
+ mean_param_path = 'data/body_models/smpl_mean_params.npz'
+ mean_params = np.load(mean_param_path)
+ mean_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0)
+ mean_shape = torch.from_numpy(
+ mean_params['shape'][:].astype('float32')).unsqueeze(0)
+ mean_pose = rot6d_to_rotmat(mean_pose).view(1, 24, 3, 3)
+ mean_output = self.body_model(betas=mean_shape,
+ body_pose=mean_pose[:, 1:],
+ global_orient=mean_pose[:, :1],
+ pose2rot=False)
+ mean_verts = mean_output['vertices'].detach().cpu().numpy() * 1000.
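+        # per-sample mean vertex distance (in mm) between the GT mesh and the
+        # mean-parameter mesh, used to rank samples from hardest to easiest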
+ dis = (gt_verts - mean_verts) * (gt_verts - mean_verts)
+ dis = np.sqrt(dis.sum(axis=-1)).mean(axis=-1)
+ # from the most remote one to the nearest one
+ idx_order = np.argsort(dis)[::-1]
+ num_data = idx_order.shape[0]
+
+ def report_ihmr_idx(idx):
+ mpvpe = vertice_pve(pred_verts[idx], gt_verts[idx])
+ mpjpe = keypoint_mpjpe(pred_keypoints3d[idx], gt_keypoints3d[idx],
+ gt_keypoints3d_mask[idx], 'none')
+ pampjpe = keypoint_mpjpe(pred_keypoints3d[idx],
+ gt_keypoints3d[idx],
+ gt_keypoints3d_mask[idx], 'procrustes')
+ return (mpvpe, mpjpe, pampjpe)
+
+ def report_ihmr_tail(percentage):
+ cur_data = int(num_data * percentage / 100.0)
+ idx = idx_order[:cur_data]
+ mpvpe, mpjpe, pampjpe = report_ihmr_idx(idx)
+ res_mpvpe = ('bMPVPE Tail ' + str(percentage) + '%', mpvpe)
+ res_mpjpe = ('bMPJPE Tail ' + str(percentage) + '%', mpjpe)
+ res_pampjpe = ('bPA-MPJPE Tail ' + str(percentage) + '%', pampjpe)
+ return [res_mpvpe, res_mpjpe, res_pampjpe]
+
+ def report_ihmr_all(num_bin):
+ num_per_bin = np.array([0 for _ in range(num_bin)
+ ]).astype(np.float32)
+ sum_mpvpe = np.array([0
+ for _ in range(num_bin)]).astype(np.float32)
+ sum_mpjpe = np.array([0
+ for _ in range(num_bin)]).astype(np.float32)
+ sum_pampjpe = np.array([0 for _ in range(num_bin)
+ ]).astype(np.float32)
+ max_dis = dis[idx_order[0]]
+ min_dis = dis[idx_order[-1]]
+ delta = (max_dis - min_dis) / num_bin
+ for i in range(num_data):
+ idx = int((dis[i] - min_dis) / delta - 0.001)
+ res_mpvpe, res_mpjpe, res_pampjpe = report_ihmr_idx([i])
+ num_per_bin[idx] += 1
+ sum_mpvpe[idx] += res_mpvpe
+ sum_mpjpe[idx] += res_mpjpe
+ sum_pampjpe[idx] += res_pampjpe
+ for i in range(num_bin):
+ if num_per_bin[i] > 0:
+ sum_mpvpe[i] = sum_mpvpe[i] / num_per_bin[i]
+ sum_mpjpe[i] = sum_mpjpe[i] / num_per_bin[i]
+ sum_pampjpe[i] = sum_pampjpe[i] / num_per_bin[i]
+ valid_idx = np.where(num_per_bin > 0)
+ res_mpvpe = ('bMPVPE All', sum_mpvpe[valid_idx].mean())
+ res_mpjpe = ('bMPJPE All', sum_mpjpe[valid_idx].mean())
+ res_pampjpe = ('bPA-MPJPE All', sum_pampjpe[valid_idx].mean())
+ return [res_mpvpe, res_mpjpe, res_pampjpe]
+
+ metrics = []
+ metrics.extend(report_ihmr_all(num_bin=100))
+ metrics.extend(report_ihmr_tail(percentage=10))
+ metrics.extend(report_ihmr_tail(percentage=5))
+ return metrics
diff --git a/detrsmpl/data/datasets/human_image_smplx_dataset.py b/detrsmpl/data/datasets/human_image_smplx_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..0cc8ae84734317bea3031d293a4d9c97ed2fd7a4
--- /dev/null
+++ b/detrsmpl/data/datasets/human_image_smplx_dataset.py
@@ -0,0 +1,386 @@
+import os
+import os.path
+import pickle
+from collections import OrderedDict
+from typing import List, Optional, Union
+
+import numpy as np
+import torch
+
+from detrsmpl.core.conventions.keypoints_mapping import (
+ get_keypoint_idx,
+ get_keypoint_idxs_by_part,
+)
+from detrsmpl.core.evaluation import fg_vertices_to_mesh_distance
+from detrsmpl.utils.transforms import aa_to_rotmat
+from .builder import DATASETS
+from .human_image_dataset import HumanImageDataset
+
+
+@DATASETS.register_module()
+class HumanImageSMPLXDataset(HumanImageDataset):
+
+ # metric
+ ALLOWED_METRICS = {
+ 'mpjpe', 'pa-mpjpe', 'pve', '3dpck', 'pa-3dpck', '3dauc', 'pa-3dauc',
+ '3DRMSE', 'pa-pve'
+ }
+
+ def __init__(
+ self,
+ data_prefix: str,
+ pipeline: list,
+ dataset_name: str,
+ body_model: Optional[Union[dict, None]] = None,
+ ann_file: Optional[Union[str, None]] = None,
+ convention: Optional[str] = 'human_data',
+ cache_data_path: Optional[Union[str, None]] = None,
+ test_mode: Optional[bool] = False,
+ num_betas: Optional[int] = 10,
+ num_expression: Optional[int] = 10,
+ face_vertex_ids_path: Optional[str] = None,
+ hand_vertex_ids_path: Optional[str] = None,
+ ):
+ super().__init__(data_prefix, pipeline, dataset_name, body_model,
+ ann_file, convention, cache_data_path, test_mode)
+ self.num_betas = num_betas
+ self.num_expression = num_expression
+ if face_vertex_ids_path is not None:
+ if os.path.exists(face_vertex_ids_path):
+ self.face_vertex_ids = np.load(face_vertex_ids_path).astype(
+ np.int32)
+ if hand_vertex_ids_path is not None:
+ if os.path.exists(hand_vertex_ids_path):
+ with open(hand_vertex_ids_path, 'rb') as f:
+ vertex_idxs_data = pickle.load(f, encoding='latin1')
+ self.left_hand_vertex_ids = vertex_idxs_data['left_hand']
+ self.right_hand_vertex_ids = vertex_idxs_data['right_hand']
+
+ def prepare_raw_data(self, idx: int):
+ """Get item from self.human_data."""
+ info = super().prepare_raw_data(idx)
+ if self.cache_reader is not None:
+ self.human_data = self.cache_reader.get_item(idx)
+ idx = idx % self.cache_reader.slice_size
+
+ if 'smplx' in self.human_data:
+ smplx_dict = self.human_data['smplx']
+ info['has_smplx'] = 1
+ else:
+ smplx_dict = {}
+ info['has_smplx'] = 0
+ if 'global_orient' in smplx_dict:
+ info['smplx_global_orient'] = smplx_dict['global_orient'][idx]
+ info['has_smplx_global_orient'] = 1
+ else:
+ info['smplx_global_orient'] = np.zeros((3), dtype=np.float32)
+ info['has_smplx_global_orient'] = 0
+
+ if 'body_pose' in smplx_dict:
+ info['smplx_body_pose'] = smplx_dict['body_pose'][idx]
+ info['has_smplx_body_pose'] = 1
+ else:
+ info['smplx_body_pose'] = np.zeros((21, 3), dtype=np.float32)
+ info['has_smplx_body_pose'] = 0
+
+ if 'right_hand_pose' in smplx_dict:
+ info['smplx_right_hand_pose'] = smplx_dict['right_hand_pose'][idx]
+ info['has_smplx_right_hand_pose'] = 1
+ else:
+ info['smplx_right_hand_pose'] = np.zeros((15, 3), dtype=np.float32)
+ info['has_smplx_right_hand_pose'] = 0
+
+ if 'left_hand_pose' in smplx_dict:
+ info['smplx_left_hand_pose'] = smplx_dict['left_hand_pose'][idx]
+ info['has_smplx_left_hand_pose'] = 1
+ else:
+ info['smplx_left_hand_pose'] = np.zeros((15, 3), dtype=np.float32)
+ info['has_smplx_left_hand_pose'] = 0
+
+ if 'jaw_pose' in smplx_dict:
+ info['smplx_jaw_pose'] = smplx_dict['jaw_pose'][idx]
+ info['has_smplx_jaw_pose'] = 1
+ else:
+ info['smplx_jaw_pose'] = np.zeros((3), dtype=np.float32)
+ info['has_smplx_jaw_pose'] = 0
+
+ if 'betas' in smplx_dict:
+ info['smplx_betas'] = smplx_dict['betas'][idx]
+ info['has_smplx_betas'] = 1
+ else:
+ info['smplx_betas'] = np.zeros((self.num_betas), dtype=np.float32)
+ info['has_smplx_betas'] = 0
+
+ if 'expression' in smplx_dict:
+ info['smplx_expression'] = smplx_dict['expression'][idx]
+ info['has_smplx_expression'] = 1
+ else:
+ info['smplx_expression'] = np.zeros((self.num_expression),
+ dtype=np.float32)
+ info['has_smplx_expression'] = 0
+
+ return info
+
+ def _parse_result(self, res, mode='keypoint', body_part=''):
+ if mode == 'vertice':
+ # pred
+ pred_vertices = res['vertices'] * 1000.
+ # gt
+ if 'vertices' in self.human_data: # stirling or ehf
+ gt_vertices = self.human_data['vertices'].copy()
+ if self.dataset_name == 'EHF':
+ gt_vertices = gt_vertices * 1000.
+ else:
+ gt_param_dict = self.human_data['smplx'].copy()
+ for key, value in gt_param_dict.items():
+ new_value = torch.FloatTensor(value)
+ if ('pose' in key or key
+ == 'global_orient') and value.shape[-2] != 3:
+ new_value = aa_to_rotmat(new_value)
+ gt_param_dict[key] = new_value
+ gt_output = self.body_model(**gt_param_dict)
+ gt_vertices = gt_output['vertices'].detach().cpu().numpy(
+ ) * 1000.
+
+ if body_part == 'right_hand':
+ pred_vertices = pred_vertices[:, self.right_hand_vertex_ids]
+ gt_vertices = gt_vertices[:, self.right_hand_vertex_ids]
+ elif body_part == 'left_hand':
+ pred_vertices = pred_vertices[:, self.left_hand_vertex_ids]
+ gt_vertices = gt_vertices[:, self.left_hand_vertex_ids]
+ elif body_part == 'face':
+ pred_vertices = pred_vertices[:, self.face_vertex_ids]
+ gt_vertices = gt_vertices[:, self.face_vertex_ids]
+
+ gt_mask = np.ones(gt_vertices.shape[:-1])
+ assert len(pred_vertices) == self.num_data
+
+ return pred_vertices, gt_vertices, gt_mask
+ elif mode == 'keypoint':
+ pred_keypoints3d = res['keypoints']
+ assert len(pred_keypoints3d) == self.num_data
+ if self.dataset_name in {'pw3d', '3DPW', '3dpw'}:
+ betas = []
+ body_pose = []
+ global_orient = []
+ gender = []
+ smpl_dict = self.human_data['smpl']
+ for idx in range(self.num_data):
+ betas.append(smpl_dict['betas'][idx])
+ body_pose.append(smpl_dict['body_pose'][idx])
+ global_orient.append(smpl_dict['global_orient'][idx])
+ if self.human_data['meta']['gender'][idx] == 'm':
+ gender.append(0)
+ else:
+ gender.append(1)
+ betas = torch.FloatTensor(betas)
+ body_pose = torch.FloatTensor(body_pose).view(-1, 69)
+ global_orient = torch.FloatTensor(global_orient)
+ gender = torch.Tensor(gender)
+ gt_output = self.body_model(betas=betas,
+ body_pose=body_pose,
+ global_orient=global_orient,
+ gender=gender)
+ gt_keypoints3d = gt_output['joints'].detach().cpu().numpy()
+ gt_keypoints3d_mask = np.ones(
+ (len(pred_keypoints3d), gt_keypoints3d.shape[1]))
+ elif self.dataset_name == 'EHF':
+ gt_vertices = self.human_data['vertices'].copy()
+ if body_part == 'J14':
+ gt_keypoints3d = torch.einsum('bik,ji->bjk', [
+ torch.from_numpy(gt_vertices).float(),
+ self.body_model.joints_regressor
+ ]).numpy()
+ pred_vertices = res['vertices']
+ pred_keypoints3d = torch.einsum('bik,ji->bjk', [
+ torch.from_numpy(pred_vertices).float(),
+ self.body_model.joints_regressor
+ ]).numpy()
+ gt_keypoints3d_mask = np.ones(
+ (len(pred_keypoints3d), gt_keypoints3d.shape[1]))
+ else:
+ gt_keypoints3d = torch.einsum('bik,ji->bjk', [
+ torch.from_numpy(gt_vertices).float(),
+ self.body_model.J_regressor
+ ]).numpy()
+ extra_joints_idxs = np.array([
+ 9120, 9929, 9448, 616, 6, 5770, 5780, 8846, 8463, 8474,
+ 8635, 5361, 4933, 5058, 5169, 5286, 8079, 7669, 7794,
+ 7905, 8022
+ ])
+ gt_keypoints3d = np.concatenate(
+ (gt_keypoints3d, gt_vertices[:, extra_joints_idxs]),
+ axis=1)
+ pred_vertices = res['vertices']
+ pred_keypoints3d = torch.einsum('bik,ji->bjk', [
+ torch.from_numpy(pred_vertices).float(),
+ self.body_model.J_regressor
+ ]).numpy()
+ pred_keypoints3d = np.concatenate(
+ (pred_keypoints3d, pred_vertices[:,
+ extra_joints_idxs]),
+ axis=1)
+
+ idxs = list(range(0, gt_keypoints3d.shape[1]))
+ if body_part == 'right_hand':
+ idxs = get_keypoint_idxs_by_part(
+ 'right_hand', self.convention)
+ idxs.append(
+ get_keypoint_idx('right_wrist', self.convention))
+ elif body_part == 'left_hand':
+ idxs = get_keypoint_idxs_by_part(
+ 'left_hand', self.convention)
+ idxs.append(
+ get_keypoint_idx('left_wrist', self.convention))
+ elif body_part == 'body':
+ idxs = get_keypoint_idxs_by_part(
+ 'body', self.convention)
+ gt_keypoints3d = gt_keypoints3d[:, idxs]
+ pred_keypoints3d = pred_keypoints3d[:, idxs]
+ gt_keypoints3d_mask = np.ones(
+ (len(pred_keypoints3d), gt_keypoints3d.shape[1]))
+ else:
+ gt_keypoints3d = self.human_data['keypoints3d'][:, :, :3]
+ gt_keypoints3d_mask = np.ones(
+ (len(pred_keypoints3d), gt_keypoints3d.shape[1]))
+
+ if gt_keypoints3d.shape[1] == 17:
+ # SMPLX_to_J14
+ assert pred_keypoints3d.shape[1] == 14
+ H36M_TO_J17 = [
+ 6, 5, 4, 1, 2, 3, 16, 15, 14, 11, 12, 13, 8, 10, 0, 7, 9
+ ]
+ H36M_TO_J14 = H36M_TO_J17[:14]
+ joint_mapper = H36M_TO_J14
+ gt_keypoints3d = gt_keypoints3d[:, joint_mapper, :]
+ pred_pelvis = pred_keypoints3d[:,
+ [2, 3], :].mean(axis=1,
+ keepdims=True)
+ gt_pelvis = gt_keypoints3d[:, [2, 3], :].mean(axis=1,
+ keepdims=True)
+ gt_keypoints3d_mask = gt_keypoints3d_mask[:, joint_mapper]
+ pred_keypoints3d = pred_keypoints3d - pred_pelvis
+ gt_keypoints3d = gt_keypoints3d - gt_pelvis
+ elif gt_keypoints3d.shape[1] == 14:
+ assert pred_keypoints3d.shape[1] == 14
+ pred_pelvis = pred_keypoints3d[:,
+ [2, 3], :].mean(axis=1,
+ keepdims=True)
+ gt_pelvis = gt_keypoints3d[:, [2, 3], :].mean(axis=1,
+ keepdims=True)
+ pred_keypoints3d = pred_keypoints3d - pred_pelvis
+ gt_keypoints3d = gt_keypoints3d - gt_pelvis
+ elif gt_keypoints3d.shape[1] == 21:
+ pred_pelvis = pred_keypoints3d[:, :1, :]
+ gt_pelvis = gt_keypoints3d[:, :1, :]
+ pred_keypoints3d = pred_keypoints3d - pred_pelvis
+ gt_keypoints3d = gt_keypoints3d - gt_pelvis
+ else:
+ pass
+
+ pred_keypoints3d = pred_keypoints3d * 1000
+ if self.dataset_name != 'stirling':
+ gt_keypoints3d = gt_keypoints3d * 1000
+ gt_keypoints3d_mask = gt_keypoints3d_mask > 0
+
+ return pred_keypoints3d, gt_keypoints3d, gt_keypoints3d_mask
+
+ def _report_3d_rmse(self, res_file):
+ """compute the 3DRMSE between a predicted 3D face shape and the 3D
+ ground truth scan."""
+ pred_vertices, gt_vertices, _ = self._parse_result(res_file,
+ mode='vertice')
+ pred_keypoints3d, gt_keypoints3d, _ = self._parse_result(
+ res_file, mode='keypoint')
+ errors = []
+ for pred_vertice, gt_vertice, pred_points, gt_points in zip(
+ pred_vertices, gt_vertices, pred_keypoints3d, gt_keypoints3d):
+ error = fg_vertices_to_mesh_distance(gt_vertice, gt_points,
+ pred_vertice,
+ self.body_model.faces,
+ pred_points)
+ errors.append(error)
+
+ error = np.array(errors).mean()
+ name_value_tuples = [('3DRMSE', error)]
+ return name_value_tuples
+
+ def evaluate(self,
+ outputs: list,
+ res_folder: str,
+ metric: Optional[Union[str, List[str]]] = 'pa-mpjpe',
+ **kwargs: dict):
+ """Evaluate 3D keypoint results.
+
+ Args:
+ outputs (list): results from model inference.
+ res_folder (str): path to store results.
+            metric (Optional[Union[str, List[str]]]):
+ the type of metric. Default: 'pa-mpjpe'
+ kwargs (dict): other arguments.
+ Returns:
+ dict:
+ A dict of all evaluation results.
+ """
+ metrics = metric if isinstance(metric, list) else [metric]
+ for metric in metrics:
+ if metric not in self.ALLOWED_METRICS:
+ raise KeyError(f'metric {metric} is not supported')
+
+ # for keeping correctness during multi-gpu test, we sort all results
+ res_dict = {}
+ for out in outputs:
+ target_id = out['image_idx']
+ batch_size = len(out['keypoints_3d'])
+ for i in range(batch_size):
+ res_dict[int(target_id[i])] = dict(
+ keypoints=out['keypoints_3d'][i],
+ vertices=out['vertices'][i],
+ )
+ keypoints, vertices = [], []
+ for i in range(self.num_data):
+ keypoints.append(res_dict[i]['keypoints'])
+ vertices.append(res_dict[i]['vertices'])
+ keypoints = np.stack(keypoints)
+ vertices = np.stack(vertices)
+ res = dict(keypoints=keypoints, vertices=vertices)
+ name_value_tuples = []
+ for index, _metric in enumerate(metrics):
+ if 'body_part' in kwargs:
+ body_parts = kwargs['body_part'][index]
+ for body_part in body_parts:
+ if _metric == 'pa-mpjpe':
+ _nv_tuples = self._report_mpjpe(res,
+ metric='pa-mpjpe',
+ body_part=body_part)
+ elif _metric == 'pa-pve':
+ _nv_tuples = self._report_pve(res,
+ metric='pa-pve',
+ body_part=body_part)
+ else:
+ raise NotImplementedError
+ name_value_tuples.extend(_nv_tuples)
+ else:
+ if _metric == 'mpjpe':
+ _nv_tuples = self._report_mpjpe(res)
+ elif _metric == 'pa-mpjpe':
+ _nv_tuples = self._report_mpjpe(res, metric='pa-mpjpe')
+ elif _metric == '3dpck':
+ _nv_tuples = self._report_3d_pck(res)
+ elif _metric == 'pa-3dpck':
+ _nv_tuples = self._report_3d_pck(res, metric='pa-3dpck')
+ elif _metric == '3dauc':
+ _nv_tuples = self._report_3d_auc(res)
+ elif _metric == 'pa-3dauc':
+ _nv_tuples = self._report_3d_auc(res, metric='pa-3dauc')
+ elif _metric == 'pve':
+ _nv_tuples = self._report_pve(res)
+ elif _metric == 'pa-pve':
+ _nv_tuples = self._report_pve(res, metric='pa-pve')
+ elif _metric == '3DRMSE':
+ _nv_tuples = self._report_3d_rmse(res)
+ else:
+ raise NotImplementedError
+ name_value_tuples.extend(_nv_tuples)
+ name_value = OrderedDict(name_value_tuples)
+ return name_value
diff --git a/detrsmpl/data/datasets/human_video_dataset.py b/detrsmpl/data/datasets/human_video_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..240cbfa7d786ba8c0c30e95f6d8982fcb80fc1e8
--- /dev/null
+++ b/detrsmpl/data/datasets/human_video_dataset.py
@@ -0,0 +1,164 @@
+import copy
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from mmcv.parallel import DataContainer as DC
+from skimage.util.shape import view_as_windows
+
+from .builder import DATASETS
+from .human_image_dataset import HumanImageDataset
+
+
+def get_vid_name(image_path: str):
+ """Get base_dir of the given path."""
+ content = image_path.split('/')
+ vid_name = '/'.join(content[:-1])
+ return vid_name
+
+
+def split_into_chunks(data_infos: list, seq_len: int, stride: int,
+ test_mode: bool, only_vid_name: bool):
+ """Split annotations into chunks.
+ Adapted from https://github.com/mkocabas/VIBE
+ Args:
+ data_infos (list): parsed annotations.
+ seq_len (int): the length of each chunk.
+ stride (int): the interval between chunks.
+ test_mode (bool): if test_mode is true, then an additional chunk
+ will be added to cover all frames. Otherwise, last few frames
+ will be dropped.
+ only_vid_name (bool): if only_vid_name is true, image_path only
+ contains the video name. Otherwise, image_path contains both
+ video_name and frame index.
+
+    Returns:
+ list:
+ shape: [N, 4]. Each chunk contains four parameters: start_frame,
+ end_frame, valid_start_frame, valid_end_frame. The last two
+ parameters are used to suppress redundant frames.
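+
+    Example:
+        >>> # illustrative: five frames of one video with seq_len=3 and
+        >>> # stride=2 produce two chunks
+        >>> split_into_chunks(
+        ...     ['v/000.jpg', 'v/001.jpg', 'v/002.jpg',
+        ...      'v/003.jpg', 'v/004.jpg'],
+        ...     seq_len=3, stride=2, test_mode=False, only_vid_name=False)
+        [[0, 2, 0, 2], [2, 4, 2, 4]]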
+ """
+ vid_names = []
+ for image_path in data_infos:
+ if only_vid_name:
+ vid_name = image_path
+ else:
+ vid_name = get_vid_name(image_path)
+ vid_names.append(vid_name)
+ vid_names = np.array(vid_names)
+ video_start_end_indices = []
+
+ video_names, group = np.unique(vid_names, return_index=True)
+ perm = np.argsort(group)
+ video_names, group = video_names[perm], group[perm]
+
+ indices = np.split(np.arange(0, vid_names.shape[0]), group[1:])
+
+ for idx in range(len(video_names)):
+ indexes = indices[idx]
+ if indexes.shape[0] < seq_len:
+ continue
+ chunks = view_as_windows(indexes, (seq_len, ), step=stride)
+ start_finish = chunks[:, (0, -1, 0, -1)].tolist()
+ video_start_end_indices += start_finish
+ if chunks[-1][-1] < indexes[-1] and test_mode:
+ start_frame = indexes[-1] - seq_len + 1
+ end_frame = indexes[-1]
+ valid_start_frame = chunks[-1][-1] + 1
+ valid_end_frame = indexes[-1]
+ extra_start_finish = [[
+ start_frame, end_frame, valid_start_frame, valid_end_frame
+ ]]
+ video_start_end_indices += extra_start_finish
+
+ return video_start_end_indices
+
+
+@DATASETS.register_module()
+class HumanVideoDataset(HumanImageDataset):
+ """Human Video Dataset.
+
+ Args:
+ data_prefix (str): the prefix of data path.
+        pipeline (list): a list of dicts, where each element represents
+            an operation defined in `detrsmpl.datasets.pipelines`.
+ dataset_name (str | None): the name of dataset. It is used to
+ identify the type of evaluation metric. Default: None.
+ seq_len (int, optional): the length of input sequence. Default: 16.
+ overlap (float, optional): the overlap between different sequences.
+ Default: 0
+ only_vid_name (bool, optional): the format of image_path.
+ If only_vid_name is true, image_path only contains the video
+ name. Otherwise, image_path contains both video_name and frame
+ index.
+ body_model (dict | None, optional): the config for body model,
+ which will be used to generate meshes and keypoints.
+ Default: None.
+ ann_file (str | None, optional): the annotation file. When ann_file
+ is str, the subclass is expected to read from the ann_file. When
+ ann_file is None, the subclass is expected to read according
+ to data_prefix.
+ convention (str, optional): keypoints convention. Keypoints will be
+ converted from "human_data" to the given one.
+ Default: "human_data"
+ test_mode (bool, optional): in train mode or test mode. Default: False.
+ """
+ def __init__(self,
+ data_prefix: str,
+ pipeline: list,
+ dataset_name: str,
+ seq_len: Optional[int] = 16,
+ overlap: Optional[float] = 0.,
+ only_vid_name: Optional[bool] = False,
+ body_model: Optional[Union[dict, None]] = None,
+ ann_file: Optional[Union[str, None]] = None,
+ convention: Optional[str] = 'human_data',
+ test_mode: Optional[bool] = False):
+ super(HumanVideoDataset, self).__init__(data_prefix=data_prefix,
+ pipeline=pipeline,
+ dataset_name=dataset_name,
+ body_model=body_model,
+ convention=convention,
+ ann_file=ann_file,
+ test_mode=test_mode)
+ self.seq_len = seq_len
+ self.stride = int(seq_len * (1 - overlap))
+ self.vid_indices = split_into_chunks(self.human_data['image_path'],
+ self.seq_len, self.stride,
+ test_mode, only_vid_name)
+ self.vid_indices = np.array(self.vid_indices)
+
+ def __len__(self):
+ return len(self.vid_indices)
+
+ def prepare_data(self, idx: int):
+ """Prepare data for each chunk.
+
+ Step 1: get annotation from each frame. Step 2: add metas of each
+ chunk.
+ """
+ start_idx, end_idx = self.vid_indices[idx][:2]
+ batch_results = []
+ image_path = []
+ for frame_idx in range(start_idx, end_idx + 1):
+ frame_results = copy.deepcopy(self.prepare_raw_data(frame_idx))
+ image_path.append(frame_results.pop('image_path'))
+ if 'features' in self.human_data:
+ frame_results['features'] = \
+ copy.deepcopy(self.human_data['features'][frame_idx])
+ frame_results = self.pipeline(frame_results)
+ batch_results.append(frame_results)
+ video_results = {}
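+        # collate the per-frame results: tensors are stacked along a new
+        # leading time dimension, other values are kept as lists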
+ for key in batch_results[0].keys():
+ batch_anno = []
+ for item in batch_results:
+ batch_anno.append(item[key])
+ if isinstance(batch_anno[0], torch.Tensor):
+ batch_anno = torch.stack(batch_anno, dim=0)
+ video_results[key] = batch_anno
+ img_metas = {
+ 'frame_idx': self.vid_indices[idx],
+ 'image_path': image_path
+ }
+ video_results['img_metas'] = DC(img_metas, cpu_only=True)
+ return video_results
diff --git a/detrsmpl/data/datasets/mesh_dataset.py b/detrsmpl/data/datasets/mesh_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a162e066b640f2d8c47409b867cd096b0be1fbe
--- /dev/null
+++ b/detrsmpl/data/datasets/mesh_dataset.py
@@ -0,0 +1,63 @@
+import os
+from abc import ABCMeta
+from typing import Optional, Union
+
+import numpy as np
+
+from .base_dataset import BaseDataset
+from .builder import DATASETS
+
+
+@DATASETS.register_module()
+class MeshDataset(BaseDataset, metaclass=ABCMeta):
+ """Mesh Dataset. This dataset only contains smpl data.
+
+ Args:
+ data_prefix (str): the prefix of data path.
+        pipeline (list): a list of dicts, where each element represents
+            an operation defined in `detrsmpl.datasets.pipelines`.
+ dataset_name (str | None): the name of dataset. It is used to
+ identify the type of evaluation metric. Default: None.
+ ann_file (str | None, optional): the annotation file. When ann_file
+ is str, the subclass is expected to read from the ann_file. When
+ ann_file is None, the subclass is expected to read according
+ to data_prefix.
+ test_mode (bool, optional): in train mode or test mode. Default: False.
+ """
+ def __init__(self,
+ data_prefix: str,
+ pipeline: list,
+ dataset_name: str,
+ ann_file: Optional[Union[str, None]] = None,
+ test_mode: Optional[bool] = False):
+ self.dataset_name = dataset_name
+ super(MeshDataset, self).__init__(data_prefix=data_prefix,
+ pipeline=pipeline,
+ ann_file=ann_file,
+ test_mode=test_mode)
+
+ def get_annotation_file(self):
+ ann_prefix = os.path.join(self.data_prefix, 'preprocessed_datasets')
+ self.ann_file = os.path.join(ann_prefix, self.ann_file)
+
+ def load_annotations(self):
+
+ self.get_annotation_file()
+ data = np.load(self.ann_file, allow_pickle=True)
+
+ self.smpl = data['smpl'].item()
+ num_data = self.smpl['global_orient'].shape[0]
+ if 'transl' not in self.smpl:
+ self.smpl['transl'] = np.zeros((num_data, 3))
+ self.has_smpl = np.ones((num_data))
+
+ data_infos = []
+
+ for idx in range(num_data):
+ info = {}
+ for k, v in self.smpl.items():
+ info['smpl_' + k] = v[idx]
+
+ data_infos.append(info)
+ self.num_data = len(data_infos)
+ self.data_infos = data_infos
diff --git a/detrsmpl/data/datasets/mixed_dataset.py b/detrsmpl/data/datasets/mixed_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..31558f05aa8b0b7447b3d3b6d6c56a3e4a0b346a
--- /dev/null
+++ b/detrsmpl/data/datasets/mixed_dataset.py
@@ -0,0 +1,48 @@
+from typing import Optional, Union
+
+import numpy as np
+from torch.utils.data import ConcatDataset, Dataset, WeightedRandomSampler
+
+from .builder import DATASETS, build_dataset
+
+
+@DATASETS.register_module()
+class MixedDataset(Dataset):
+ """Mixed Dataset.
+
+ Args:
+        configs (list): the list of configs for the datasets to be mixed.
+ partition (list): the ratio of datasets in each batch.
+ num_data (int | None, optional): if num_data is not None, the number
+ of iterations is set to this fixed value. Otherwise, the number of
+ iterations is set to the maximum size of each single dataset.
+ Default: None.
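+
+    Example:
+        >>> # illustrative; ``h36m_cfg`` and ``coco_cfg`` stand for any two
+        >>> # registered dataset configs
+        >>> dataset = MixedDataset(configs=[h36m_cfg, coco_cfg],
+        ...                        partition=[0.75, 0.25])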
+ """
+ def __init__(self,
+ configs: list,
+ partition: list,
+ num_data: Optional[Union[int, None]] = None):
+ """Load data from multiple datasets."""
+ assert min(partition) >= 0
+ datasets = [build_dataset(cfg) for cfg in configs]
+ self.dataset = ConcatDataset(datasets)
+ if num_data is not None:
+ self.length = num_data
+ else:
+ self.length = max(len(ds) for ds in datasets)
+ weights = [
+ np.ones(len(ds)) * p / len(ds)
+ for (p, ds) in zip(partition, datasets)
+ ]
+ weights = np.concatenate(weights, axis=0)
+ self.sampler = WeightedRandomSampler(weights, 1)
+
+ def __len__(self):
+ """Get the size of the dataset."""
+ return self.length
+
+ def __getitem__(self, idx):
+ """Given index, sample the data from multiple datasets with the given
+ proportion."""
+ idx_new = list(self.sampler)[0]
+ return self.dataset[idx_new]
diff --git a/detrsmpl/data/datasets/multi_human_image_dataset.py b/detrsmpl/data/datasets/multi_human_image_dataset.py
new file mode 100644
index 0000000000000000000000000000000000000000..426e363653ca87c5b93e284fee1843a1b15f9593
--- /dev/null
+++ b/detrsmpl/data/datasets/multi_human_image_dataset.py
@@ -0,0 +1,757 @@
+import json
+import os
+import os.path
+from abc import ABCMeta
+from collections import OrderedDict
+from typing import Any, List, Optional, Union
+
+import mmcv
+import numpy as np
+import torch
+import torch.distributed as dist
+from mmcv.runner import get_dist_info
+
+from detrsmpl.core.conventions.keypoints_mapping import (
+ convert_kps,
+ get_keypoint_num,
+ get_mapping,
+)
+
+from detrsmpl.core.evaluation import (
+ keypoint_3d_auc,
+ keypoint_3d_pck,
+ keypoint_mpjpe,
+ vertice_pve,
+)
+
+from detrsmpl.data.data_structures.multi_human_data import MultiHumanData
+from detrsmpl.models.body_models.builder import build_body_model
+from .base_dataset import BaseDataset
+from .builder import DATASETS
+
+
+@DATASETS.register_module()
+class MultiHumanImageDataset(BaseDataset, metaclass=ABCMeta):
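+    """Image dataset with multiple human instances per sample.
+
+    Annotations are loaded from a preprocessed ``MultiHumanData`` file, and
+    instances belonging to the same image are grouped by ``frame_range``
+    (see ``load_annotations`` and ``prepare_raw_data``).
+    """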
+ def __init__(self,
+ data_prefix: str,
+ pipeline: list,
+ body_model: Optional[Union[dict, None]] = None,
+ ann_file: Optional[Union[str, None]] = None,
+ convention: Optional[str] = 'human_data',
+ test_mode: Optional[bool] = False,
+ dataset_name: Optional[Union[str, None]] = None):
+ self.num_keypoints = get_keypoint_num(convention)
+ self.convention = convention
+ super(MultiHumanImageDataset,
+ self).__init__(data_prefix, pipeline, ann_file, test_mode,
+ dataset_name)
+
+ if body_model is not None:
+ self.body_model = build_body_model(body_model)
+ else:
+ self.body_model = None
+
+ def get_annotation_file(self):
+ """Get path of the annotation file."""
+ ann_prefix = os.path.join(self.data_prefix, 'preprocessed_datasets')
+ self.ann_file = os.path.join(ann_prefix, self.ann_file)
+
+ def load_annotations(self):
+ """Load annotations."""
+ self.get_annotation_file()
+ self.human_data = MultiHumanData()
+ self.human_data.load(self.ann_file)
+
+ self.instance_num = self.human_data.instance_num
+ self.image_path = self.human_data['image_path']
+ self.num_data = self.human_data.data_len
+
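+        # 'frame_range' maps each sample to a [start, end) slice over all
+        # instances, so one image can contain several people. If it is
+        # missing, fall back to one instance per sample.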
+ try:
+ self.frame_range = self.human_data['frame_range']
+ except KeyError:
+ self.frame_range = \
+ np.array([[i, i + 1] for i in range(self.num_data)])
+
+ self.num_data = self.frame_range.shape[0]
+ if self.human_data.check_keypoints_compressed():
+ self.human_data.decompress_keypoints()
+
+ # change keypoint from 'human_data' to the given convention
+ if 'keypoints3d_ori' in self.human_data:
+ keypoints3d_ori = self.human_data['keypoints3d_ori']
+ assert 'keypoints3d_ori_mask' in self.human_data
+ keypoints3d_ori_mask = self.human_data['keypoints3d_ori_mask']
+ keypoints3d_ori, keypoints3d_ori_mask = \
+ convert_kps(
+ keypoints3d_ori,
+ src='human_data',
+ dst=self.convention,
+ mask=keypoints3d_ori_mask)
+ self.human_data.__setitem__('keypoints3d_ori', keypoints3d_ori)
+ self.human_data.__setitem__('keypoints3d_ori_convention',
+ self.convention)
+ self.human_data.__setitem__('keypoints3d_ori_mask',
+ keypoints3d_ori_mask)
+ elif 'keypoints3d' in self.human_data:
+ keypoints3d_ori = self.human_data['keypoints3d']
+ assert 'keypoints3d_mask' in self.human_data
+ keypoints3d_ori_mask = self.human_data['keypoints3d_mask']
+ keypoints3d_ori, keypoints3d_ori_mask = \
+ convert_kps(
+ keypoints3d_ori,
+ src='human_data',
+ dst=self.convention,
+ mask=keypoints3d_ori_mask)
+ self.human_data.__setitem__('keypoints3d_ori', keypoints3d_ori)
+ self.human_data.__setitem__('keypoints3d_ori_convention',
+ self.convention)
+ self.human_data.__setitem__('keypoints3d_ori_mask',
+ keypoints3d_ori_mask)
+
+ if 'keypoints2d_ori' in self.human_data:
+ keypoints2d_ori = self.human_data['keypoints2d_ori']
+ assert 'keypoints2d_ori_mask' in self.human_data
+ keypoints2d_ori_mask = self.human_data['keypoints2d_ori_mask']
+ keypoints2d_ori, keypoints2d_ori_mask = \
+ convert_kps(
+ keypoints2d_ori,
+ src='human_data',
+ dst=self.convention,
+ mask=keypoints2d_ori_mask)
+ self.human_data.__setitem__('keypoints2d_ori', keypoints2d_ori)
+ self.human_data.__setitem__('keypoints2d_ori_convention',
+ self.convention)
+ self.human_data.__setitem__('keypoints2d_ori_mask',
+ keypoints2d_ori_mask)
+ ori_mask = keypoints2d_ori[:, :, 2]
+ elif 'keypoints2d' in self.human_data:
+ keypoints2d_ori = self.human_data['keypoints2d']
+ assert 'keypoints2d_mask' in self.human_data
+ keypoints2d_ori_mask = self.human_data['keypoints2d_mask']
+ keypoints2d_ori, keypoints2d_ori_mask = \
+ convert_kps(
+ keypoints2d_ori,
+ src='human_data',
+ dst=self.convention,
+ mask=keypoints2d_ori_mask)
+ self.human_data.__setitem__('keypoints2d_ori', keypoints2d_ori)
+ self.human_data.__setitem__('keypoints2d_ori_convention',
+ self.convention)
+ self.human_data.__setitem__('keypoints2d_ori_mask',
+ keypoints2d_ori_mask)
+
+ # if 'has_smpl' in self.human_data:
+ # index = ori_mask.sum(-1)>=8
+ # self.human_data['has_smpl']=self.human_data['has_smpl'][:147270]*index
+ # change keypoint from 'human_data' to the given convention
+ if 'keypoints3d_smpl' in self.human_data:
+ keypoints3d_smpl = self.human_data['keypoints3d_smpl']
+ assert 'keypoints3d_smpl_mask' in self.human_data
+ keypoints3d_smpl_mask = self.human_data['keypoints3d_smpl_mask']
+ keypoints3d_smpl, keypoints3d_smpl_mask = \
+ convert_kps(
+ keypoints3d_smpl,
+ src='human_data',
+ dst=self.convention,
+ mask=keypoints3d_smpl_mask)
+ # index = ori_mask.sum(-1)<8
+ # keypoints3d_smpl[index]=np.concatenate(
+ # [keypoints3d_smpl[index][:,:,:3],
+ # keypoints2d_ori[index][:,:,[2]]],
+ # -1)
+ self.human_data.__setitem__('keypoints3d_smpl', keypoints3d_smpl)
+ self.human_data.__setitem__('keypoints3d_smpl_convention',
+ self.convention)
+ self.human_data.__setitem__('keypoints3d_smpl_mask',
+ keypoints3d_smpl_mask)
+
+ if 'keypoints2d_smpl' in self.human_data:
+ keypoints2d_smpl = self.human_data['keypoints2d_smpl']
+ assert 'keypoints2d_smpl_mask' in self.human_data
+ keypoints2d_smpl_mask = self.human_data['keypoints2d_smpl_mask']
+ keypoints2d_smpl, keypoints2d_smpl_mask = \
+ convert_kps(
+ keypoints2d_smpl,
+ src='human_data',
+ dst=self.convention,
+ mask=keypoints2d_smpl_mask)
+ # index = ori_mask.sum(-1)<8
+ # keypoints2d_smpl[index]=np.concatenate(
+ # [keypoints2d_smpl[index][:,:,:2],
+ # keypoints2d_ori[index][:,:,[2]]],
+ # -1)
+ # keypoints2d_smpl[index][:,:,2]=keypoints2d_ori[index][:, :,2]
+ self.human_data.__setitem__('keypoints2d_smpl', keypoints2d_smpl)
+ self.human_data.__setitem__('keypoints2d_smpl_convention',
+ self.convention)
+ self.human_data.__setitem__('keypoints2d_smpl_mask',
+ keypoints2d_smpl_mask)
+ self.human_data.compress_keypoints_by_mask()
+
+
+ def prepare_raw_data(self, idx: int):
+ """Get item from self.human_data."""
+ sample_idx = idx
+ frame_start, frame_end = self.frame_range[idx]
+ frame_num = frame_end - frame_start
+ # TODO: Support cache_reader?
+ info = {}
+ info['img_prefix'] = None
+ image_path = self.human_data['image_path'][frame_start]
+ info['image_path'] = os.path.join(self.data_prefix, 'datasets',
+ self.dataset_name, image_path)
+ # TODO: Support smc?
+ info['dataset_name'] = self.dataset_name
+ info['sample_idx'] = sample_idx
+ if 'bbox_xywh' in self.human_data:
+ info['bbox_xywh'] = self.human_data['bbox_xywh'][
+ frame_start:frame_end]
+ center, scale = [], []
+ for bbox in info['bbox_xywh']:
+ x, y, w, h, s = bbox
+ cx = x + w / 2
+ cy = y + h / 2
+ # TODO: verify if we should keep w = h = max(w, h) for multi human data
+ w = h = max(w, h)
+ center.append([cx, cy])
+ scale.append([w, h])
+ info['center'] = np.array(center)
+ info['scale'] = np.array(scale)
+ else:
+ info['bbox_xywh'] = np.zeros((frame_num, 5))
+ info['center'] = np.zeros((frame_num, 2))
+ info['scale'] = np.zeros((frame_num, 2))
+
+ if 'keypoints2d_ori' in self.human_data:
+ info['keypoints2d_ori'] = self.human_data['keypoints2d_ori'][
+ frame_start:frame_end]
+ conf = info['keypoints2d_ori'][..., -1].sum(-1) > 0
+ info['has_keypoints2d_ori'] = np.ones(
+ (frame_num, 1)) * conf[..., None]
+ else:
+ info['keypoints2d_ori'] = np.zeros(
+ (frame_num, self.num_keypoints, 3))
+ info['has_keypoints2d_ori'] = np.zeros((frame_num, 1))
+
+ if 'keypoints3d_ori' in self.human_data:
+ info['keypoints3d_ori'] = self.human_data['keypoints3d_ori'][
+ frame_start:frame_end]
+ conf = info['keypoints3d_ori'][..., -1].sum(-1) > 0
+ info['has_keypoints3d_ori'] = np.ones(
+ (frame_num, 1)) * conf[..., None]
+ else:
+ info['keypoints3d_ori'] = np.zeros(
+ (frame_num, self.num_keypoints, 4))
+ info['has_keypoints3d_ori'] = np.zeros((frame_num, 1))
+
+ if 'keypoints2d_smpl' in self.human_data:
+ info['keypoints2d_smpl'] = self.human_data['keypoints2d_smpl'][
+ frame_start:frame_end]
+ conf = info['keypoints2d_smpl'][..., -1].sum(-1) > 0
+ info['has_keypoints2d_smpl'] = np.ones(
+ (frame_num, 1)) * conf[..., None]
+ else:
+ info['keypoints2d_smpl'] = np.zeros(
+ (frame_num, self.num_keypoints, 3))
+ info['has_keypoints2d_smpl'] = np.zeros((frame_num, 1))
+
+ if 'keypoints3d_smpl' in self.human_data:
+ info['keypoints3d_smpl'] = self.human_data['keypoints3d_smpl'][
+ frame_start:frame_end]
+ conf = info['keypoints3d_smpl'][..., -1].sum(-1) > 0
+ info['has_keypoints3d_smpl'] = np.ones(
+ (frame_num, 1)) * conf[..., None]
+ else:
+ info['keypoints3d_smpl'] = np.zeros(
+ (frame_num, self.num_keypoints, 4))
+ info['has_keypoints3d_smpl'] = np.zeros((frame_num, 1))
+
+ if 'smpl' in self.human_data:
+ if 'has_smpl' in self.human_data:
+ info['has_smpl'] = \
+ self.human_data['has_smpl'][frame_start:frame_end]
+ else:
+ info['has_smpl'] = np.ones((frame_num, 1))
+ smpl_dict = self.human_data['smpl']
+ else:
+ info['has_smpl'] = np.zeros((frame_num, 1))
+ smpl_dict = {}
+
+ if 'body_pose' in smpl_dict:
+ info['smpl_body_pose'] = smpl_dict['body_pose'][
+ frame_start:frame_end]
+ else:
+ info['smpl_body_pose'] = np.zeros((frame_num, 23, 3))
+
+ if 'global_orient' in smpl_dict:
+ info['smpl_global_orient'] = smpl_dict['global_orient'][
+ frame_start:frame_end]
+ else:
+ info['smpl_global_orient'] = np.zeros((frame_num, 3))
+
+ if 'betas' in smpl_dict:
+ info['smpl_betas'] = smpl_dict['betas'][frame_start:frame_end]
+ else:
+ info['smpl_betas'] = np.zeros((frame_num, 10))
+
+ if 'transl' in smpl_dict:
+ info['smpl_transl'] = smpl_dict['transl'][frame_start:frame_end]
+ else:
+ info['smpl_transl'] = np.zeros((frame_num, 3))
+
+ if 'area' in self.human_data:
+ info['area'] = self.human_data['area'][frame_start:frame_end]
+ else:
+ info['area'] = np.zeros((frame_num, 0))
+
+ return info
+
+ def prepare_data(self, idx: int):
+ """Generate and transform data."""
+ info = self.prepare_raw_data(idx)
+ return self.pipeline(info)
+
+ def evaluate(self,
+ outputs: list,
+ res_folder: str,
+ metric: Optional[Union[str, List[str]]] = 'pa-mpjpe',
+ **kwargs: dict):
+ """Evaluate 3D keypoint results.
+
+ Args:
+ outputs (list): results from model inference.
+ res_folder (str): path to store results.
+            metric (Optional[Union[str, List[str]]]):
+ the type of metric. Default: 'pa-mpjpe'
+ kwargs (dict): other arguments.
+ Returns:
+ dict:
+ A dict of all evaluation results.
+ """
+ metrics = metric if isinstance(metric, list) else [metric]
+ for metric in metrics:
+ if metric not in self.ALLOWED_METRICS:
+ raise KeyError(f'metric {metric} is not supported')
+
+ res_file = os.path.join(res_folder, 'result_keypoints.json')
+ # for keeping correctness during multi-gpu test, we sort all results
+ res_dict = {}
+ # 'scores', 'labels', 'boxes', 'keypoints', 'pred_smpl_pose',
+ # 'pred_smpl_beta', 'pred_smpl_cam', 'pred_smpl_kp3d',
+ # 'gt_smpl_pose', 'gt_smpl_beta', 'gt_smpl_kp3d', 'gt_boxes',
+ # 'gt_keypoints', 'image_idx'
+ for out in outputs:
+ target_id = out['image_idx']
+ batch_size = len(out['pred_smpl_kp3d'])
+ for i in range(batch_size):
+ res_dict[int(target_id[i])] = dict(
+ keypoints=out['pred_smpl_kp3d'][i],
+ gt_poses=out['gt_smpl_pose'][i],
+ gt_betas=out['gt_smpl_beta'][i],
+ pred_poses=out['pred_smpl_pose'][i],
+ pred_betas=out['pred_smpl_beta'][i])
+ keypoints, gt_poses, gt_betas, pred_poses, pred_betas = \
+ [], [], [], [], []
+ # print(self.num_data)
+ for i in range(self.num_data):
+ keypoints.append(res_dict[i]['keypoints'])
+ gt_poses.append(res_dict[i]['gt_poses'])
+ gt_betas.append(res_dict[i]['gt_betas'])
+ pred_poses.append(res_dict[i]['pred_poses'])
+ pred_betas.append(res_dict[i]['pred_betas'])
+
+ res = dict(keypoints=keypoints,
+ gt_poses=gt_poses,
+ gt_betas=gt_betas,
+ pred_poses=pred_poses,
+ pred_betas=pred_betas)
+ # mmcv.dump(res, res_file)
+ name_value_tuples = []
+ for _metric in metrics:
+ if _metric == 'mpjpe':
+ _nv_tuples = self._report_mpjpe(res)
+ elif _metric == 'pa-mpjpe':
+ _nv_tuples = self._report_mpjpe(res, metric='pa-mpjpe')
+ print(_nv_tuples)
+ elif _metric == '3dpck':
+ _nv_tuples = self._report_3d_pck(res)
+ elif _metric == 'pa-3dpck':
+ _nv_tuples = self._report_3d_pck(res, metric='pa-3dpck')
+ elif _metric == '3dauc':
+ _nv_tuples = self._report_3d_auc(res)
+ elif _metric == 'pa-3dauc':
+ _nv_tuples = self._report_3d_auc(res, metric='pa-3dauc')
+ elif _metric == 'pve':
+ _nv_tuples = self._report_pve(res)
+ elif _metric == 'ihmr':
+ _nv_tuples = self._report_ihmr(res)
+ else:
+ raise NotImplementedError
+ name_value_tuples.extend(_nv_tuples)
+
+ name_value = OrderedDict(name_value_tuples)
+ return name_value
+
+ @staticmethod
+ def _write_keypoint_results(keypoints: Any, res_file: str):
+ """Write results into a json file."""
+
+ with open(res_file, 'w') as f:
+ json.dump(keypoints, f, sort_keys=True, indent=4)
+
+ def _parse_result(self, res, mode='keypoint', body_part=None):
+ """Parse results."""
+
+ if mode == 'vertice':
+ # gt
+ gt_beta, gt_pose, gt_global_orient, gender = [], [], [], []
+ gt_smpl_dict = self.human_data['smpl']
+ for idx in range(self.num_data):
+ gt_beta.append(gt_smpl_dict['betas'][idx])
+ gt_pose.append(gt_smpl_dict['body_pose'][idx])
+ gt_global_orient.append(gt_smpl_dict['global_orient'][idx])
+ if self.human_data['meta']['gender'][idx] == 'm':
+ gender.append(0)
+ else:
+ gender.append(1)
+ gt_beta = torch.FloatTensor(gt_beta)
+ gt_pose = torch.FloatTensor(gt_pose).view(-1, 69)
+ gt_global_orient = torch.FloatTensor(gt_global_orient)
+ gender = torch.Tensor(gender)
+ gt_output = self.body_model(betas=gt_beta,
+ body_pose=gt_pose,
+ global_orient=gt_global_orient,
+ gender=gender)
+ gt_vertices = gt_output['vertices'].detach().cpu().numpy() * 1000.
+ gt_mask = np.ones(gt_vertices.shape[:-1])
+ # pred
+ pred_pose = torch.FloatTensor(res['pred_poses'])
+ pred_beta = torch.FloatTensor(res['pred_betas'])
+ pred_output = self.body_model(
+ betas=pred_beta[:, 0],
+ body_pose=pred_pose[:, 0, 1:],
+ global_orient=pred_pose[:, 0, 0].unsqueeze(1),
+ pose2rot=False)
+ pred_vertices = pred_output['vertices'].detach().cpu().numpy(
+ ) * 1000.
+
+ assert len(pred_vertices) == self.num_data
+
+ return pred_vertices, gt_vertices, gt_mask
+ elif mode == 'keypoint':
+ pred_keypoints3d = res['keypoints']
+ assert len(pred_keypoints3d) == self.num_data
+ # (B, 17, 3)
+ pred_keypoints3d = np.array(pred_keypoints3d).reshape(
+ len(pred_keypoints3d), -1, 3)
+ # pred_keypoints3d,_ = convert_kps(
+ # pred_keypoints3d,
+ # src='smpl_54',
+ # dst='h36m',
+ # )
+
+ gt_smpl_pose = np.array(res['gt_poses'])
+ gt_body_pose = gt_smpl_pose[..., 1:, :]
+ gt_global_orient = gt_smpl_pose[..., 0, :]
+ gt_betas = np.array(res['gt_betas'])
+ gender = np.zeros([gt_betas.shape[0], gt_betas.shape[1]])
+ if self.dataset_name == 'pw3d':
+ # betas = []
+ # body_pose = []
+ # global_orient = []
+ # gender = []
+ # smpl_dict = self.human_data['smpl']
+
+ # for idx in range(self.num_data):
+ # betas.append(smpl_dict['betas'][idx])
+ # body_pose.append(smpl_dict['body_pose'][idx])
+ # global_orient.append(smpl_dict['global_orient'][idx])
+ # if self.human_data['meta']['gender'][idx] == 'm':
+ # gender.append(0)
+ # else:
+ # gender.append(1)
+ betas = torch.FloatTensor(gt_betas).view(-1, 10)
+ body_pose = torch.FloatTensor(gt_body_pose).view(-1, 69)
+ global_orient = torch.FloatTensor(gt_global_orient).view(-1, 3)
+ gender = torch.Tensor(gender).view(-1)
+ gt_output = self.body_model(betas=betas,
+ body_pose=body_pose,
+ global_orient=global_orient,
+ gender=gender)
+ gt_keypoints3d = gt_output['joints'].detach().cpu().numpy()
+ # gt_keypoints3d,_ = convert_kps(
+ # gt_keypoints3d,
+ # src='smpl_54',
+ # dst='h36m')
+ gt_keypoints3d_mask = np.ones((len(pred_keypoints3d), 17))
+ elif self.dataset_name == 'h36m':
+ _, h36m_idxs, _ = get_mapping('human_data', 'h36m')
+ gt_keypoints3d = \
+ self.human_data['keypoints3d'][:, h36m_idxs, :3]
+ gt_keypoints3d_mask = np.ones((len(pred_keypoints3d), 17))
+ elif self.dataset_name == 'humman':
+ betas = []
+ body_pose = []
+ global_orient = []
+ smpl_dict = self.human_data['smpl']
+ for idx in range(self.num_data):
+ betas.append(smpl_dict['betas'][idx])
+ body_pose.append(smpl_dict['body_pose'][idx])
+ global_orient.append(smpl_dict['global_orient'][idx])
+ betas = torch.FloatTensor(betas)
+ body_pose = torch.FloatTensor(body_pose).view(-1, 69)
+ global_orient = torch.FloatTensor(global_orient)
+ gt_output = self.body_model(betas=betas,
+ body_pose=body_pose,
+ global_orient=global_orient)
+ gt_keypoints3d = gt_output['joints'].detach().cpu().numpy()
+ gt_keypoints3d_mask = np.ones((len(pred_keypoints3d), 24))
+ else:
+ raise NotImplementedError()
+
+ # SMPL_49 only!
+ if gt_keypoints3d.shape[1] == 49:
+ assert pred_keypoints3d.shape[1] == 49
+
+ gt_keypoints3d = gt_keypoints3d[:, 25:, :]
+ pred_keypoints3d = pred_keypoints3d[:, 25:, :]
+
+ joint_mapper = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18]
+ gt_keypoints3d = gt_keypoints3d[:, joint_mapper, :]
+ pred_keypoints3d = pred_keypoints3d[:, joint_mapper, :]
+
+ # we only evaluate on 14 lsp joints
+ pred_pelvis = (pred_keypoints3d[:, 2] +
+ pred_keypoints3d[:, 3]) / 2
+ gt_pelvis = (gt_keypoints3d[:, 2] + gt_keypoints3d[:, 3]) / 2
+
+ # H36M for testing!
+ elif gt_keypoints3d.shape[1] == 17:
+ assert pred_keypoints3d.shape[-2] == 17
+
+ H36M_TO_J17 = [
+ 6, 5, 4, 1, 2, 3, 16, 15, 14, 11, 12, 13, 8, 10, 0, 7, 9
+ ]
+ H36M_TO_J14 = H36M_TO_J17[:14]
+ joint_mapper = H36M_TO_J14
+
+ pred_pelvis = pred_keypoints3d[:, 0]
+ gt_pelvis = gt_keypoints3d[:, 0]
+
+ gt_keypoints3d = gt_keypoints3d[:, joint_mapper, :]
+ pred_keypoints3d = pred_keypoints3d[:, joint_mapper, :]
+
+ # keypoint 24
+ elif gt_keypoints3d.shape[1] == 24:
+ assert pred_keypoints3d.shape[1] == 24
+
+ joint_mapper = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18]
+ gt_keypoints3d = gt_keypoints3d[:, joint_mapper, :]
+ pred_keypoints3d = pred_keypoints3d[:, joint_mapper, :]
+
+ # we only evaluate on 14 lsp joints
+ pred_pelvis = (pred_keypoints3d[:, 2] +
+ pred_keypoints3d[:, 3]) / 2
+ gt_pelvis = (gt_keypoints3d[:, 2] + gt_keypoints3d[:, 3]) / 2
+
+ else:
+ pass
+
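+            # Root-align predictions and ground truth at the pelvis and scale
+            # by 1000 (metres to millimetres) before computing errors.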
+ pred_keypoints3d = (pred_keypoints3d -
+ pred_pelvis[:, None, :]) * 1000
+ gt_keypoints3d = (gt_keypoints3d - gt_pelvis[:, None, :]) * 1000
+
+ gt_keypoints3d_mask = gt_keypoints3d_mask[:, joint_mapper] > 0
+
+ return pred_keypoints3d, gt_keypoints3d, gt_keypoints3d_mask
+
+ def _report_mpjpe(self, res_file, metric='mpjpe', body_part=''):
+        """Calculate mean per joint position error (MPJPE) or its variant
+        PA-MPJPE.
+
+ Report mean per joint position error (MPJPE) and mean per joint
+ position error after rigid alignment (PA-MPJPE)
+ """
+ pred_keypoints3d, gt_keypoints3d, gt_keypoints3d_mask = \
+ self._parse_result(res_file, mode='keypoint', body_part=body_part)
+
+ err_name = metric.upper()
+ if body_part != '':
+ err_name = body_part.upper() + ' ' + err_name
+
+ if metric == 'mpjpe':
+ alignment = 'none'
+ elif metric == 'pa-mpjpe':
+ alignment = 'procrustes'
+ else:
+ raise ValueError(f'Invalid metric: {metric}')
+
+ error = keypoint_mpjpe(pred_keypoints3d, gt_keypoints3d,
+ gt_keypoints3d_mask, alignment)
+ info_str = [(err_name, error)]
+
+ return info_str
+
+ def _report_3d_pck(self, res_file, metric='3dpck'):
+        """Calculate the Percentage of Correct Keypoints (3DPCK) with or without
+ Procrustes alignment.
+ Args:
+ keypoint_results (list): Keypoint predictions. See
+ 'Body3DMpiInf3dhpDataset.evaluate' for details.
+ metric (str): Specify mpjpe variants. Supported options are:
+ - ``'3dpck'``: Standard 3DPCK.
+ - ``'pa-3dpck'``:
+ 3DPCK after aligning prediction to groundtruth
+ via a rigid transformation (scale, rotation and
+ translation).
+ """
+
+ pred_keypoints3d, gt_keypoints3d, gt_keypoints3d_mask = \
+ self._parse_result(res_file)
+
+ err_name = metric.upper()
+ if metric == '3dpck':
+ alignment = 'none'
+ elif metric == 'pa-3dpck':
+ alignment = 'procrustes'
+ else:
+ raise ValueError(f'Invalid metric: {metric}')
+
+ error = keypoint_3d_pck(pred_keypoints3d, gt_keypoints3d,
+ gt_keypoints3d_mask, alignment)
+ name_value_tuples = [(err_name, error)]
+
+ return name_value_tuples
+
+ def _report_3d_auc(self, res_file, metric='3dauc'):
+        """Calculate the Area Under the Curve (AUC) computed for a range of
+ 3DPCK thresholds.
+ Args:
+ keypoint_results (list): Keypoint predictions. See
+ 'Body3DMpiInf3dhpDataset.evaluate' for details.
+ metric (str): Specify mpjpe variants. Supported options are:
+ - ``'3dauc'``: Standard 3DAUC.
+ - ``'pa-3dauc'``: 3DAUC after aligning prediction to
+ groundtruth via a rigid transformation (scale, rotation and
+ translation).
+ """
+
+ pred_keypoints3d, gt_keypoints3d, gt_keypoints3d_mask = \
+ self._parse_result(res_file)
+
+ err_name = metric.upper()
+ if metric == '3dauc':
+ alignment = 'none'
+ elif metric == 'pa-3dauc':
+ alignment = 'procrustes'
+ else:
+ raise ValueError(f'Invalid metric: {metric}')
+
+ error = keypoint_3d_auc(pred_keypoints3d, gt_keypoints3d,
+ gt_keypoints3d_mask, alignment)
+ name_value_tuples = [(err_name, error)]
+
+ return name_value_tuples
+
+ def _report_pve(self, res_file, metric='pve', body_part=''):
+        """Calculate per-vertex error (PVE)."""
+ pred_verts, gt_verts, _ = \
+ self._parse_result(res_file, mode='vertice', body_part=body_part)
+ err_name = metric.upper()
+ if body_part != '':
+ err_name = body_part.upper() + ' ' + err_name
+
+ if metric == 'pve':
+ alignment = 'none'
+ elif metric == 'pa-pve':
+ alignment = 'procrustes'
+ else:
+ raise ValueError(f'Invalid metric: {metric}')
+ error = vertice_pve(pred_verts, gt_verts, alignment)
+ return [(err_name, error)]
+
+ def _report_ihmr(self, res_file):
+ """Calculate IHMR metric.
+
+ https://arxiv.org/abs/2203.16427
+ """
+ pred_keypoints3d, gt_keypoints3d, gt_keypoints3d_mask = \
+ self._parse_result(res_file, mode='keypoint')
+
+ pred_verts, gt_verts, _ = \
+ self._parse_result(res_file, mode='vertice')
+
+ from detrsmpl.utils.geometry import rot6d_to_rotmat
+ mean_param_path = 'data/body_models/smpl_mean_params.npz'
+ mean_params = np.load(mean_param_path)
+ mean_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0)
+ mean_shape = torch.from_numpy(
+ mean_params['shape'][:].astype('float32')).unsqueeze(0)
+ mean_pose = rot6d_to_rotmat(mean_pose).view(1, 24, 3, 3)
+ mean_output = self.body_model(betas=mean_shape,
+ body_pose=mean_pose[:, 1:],
+ global_orient=mean_pose[:, :1],
+ pose2rot=False)
+ mean_verts = mean_output['vertices'].detach().cpu().numpy() * 1000.
+ dis = (gt_verts - mean_verts) * (gt_verts - mean_verts)
+ dis = np.sqrt(dis.sum(axis=-1)).mean(axis=-1)
+ # from the most remote one to the nearest one
+ idx_order = np.argsort(dis)[::-1]
+ num_data = idx_order.shape[0]
+
+ def report_ihmr_idx(idx):
+ mpvpe = vertice_pve(pred_verts[idx], gt_verts[idx])
+ mpjpe = keypoint_mpjpe(pred_keypoints3d[idx], gt_keypoints3d[idx],
+ gt_keypoints3d_mask[idx], 'none')
+ pampjpe = keypoint_mpjpe(pred_keypoints3d[idx],
+ gt_keypoints3d[idx],
+ gt_keypoints3d_mask[idx], 'procrustes')
+ return (mpvpe, mpjpe, pampjpe)
+
+ def report_ihmr_tail(percentage):
+ cur_data = int(num_data * percentage / 100.0)
+ idx = idx_order[:cur_data]
+ mpvpe, mpjpe, pampjpe = report_ihmr_idx(idx)
+ res_mpvpe = ('bMPVPE Tail ' + str(percentage) + '%', mpvpe)
+ res_mpjpe = ('bMPJPE Tail ' + str(percentage) + '%', mpjpe)
+ res_pampjpe = ('bPA-MPJPE Tail ' + str(percentage) + '%', pampjpe)
+ return [res_mpvpe, res_mpjpe, res_pampjpe]
+
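+        # Bin samples by their mean vertex distance to the mean-shape mesh and
+        # average the per-bin errors, so that rare hard cases are not swamped
+        # by the many easy ones.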
+        def report_ihmr_all(num_bin):
+            num_per_bin = np.zeros(num_bin, dtype=np.float32)
+            sum_mpvpe = np.zeros(num_bin, dtype=np.float32)
+            sum_mpjpe = np.zeros(num_bin, dtype=np.float32)
+            sum_pampjpe = np.zeros(num_bin, dtype=np.float32)
+ max_dis = dis[idx_order[0]]
+ min_dis = dis[idx_order[-1]]
+ delta = (max_dis - min_dis) / num_bin
+ for i in range(num_data):
+ idx = int((dis[i] - min_dis) / delta - 0.001)
+ res_mpvpe, res_mpjpe, res_pampjpe = report_ihmr_idx([i])
+ num_per_bin[idx] += 1
+ sum_mpvpe[idx] += res_mpvpe
+ sum_mpjpe[idx] += res_mpjpe
+ sum_pampjpe[idx] += res_pampjpe
+ for i in range(num_bin):
+ if num_per_bin[i] > 0:
+ sum_mpvpe[i] = sum_mpvpe[i] / num_per_bin[i]
+ sum_mpjpe[i] = sum_mpjpe[i] / num_per_bin[i]
+ sum_pampjpe[i] = sum_pampjpe[i] / num_per_bin[i]
+ valid_idx = np.where(num_per_bin > 0)
+ res_mpvpe = ('bMPVPE All', sum_mpvpe[valid_idx].mean())
+ res_mpjpe = ('bMPJPE All', sum_mpjpe[valid_idx].mean())
+ res_pampjpe = ('bPA-MPJPE All', sum_pampjpe[valid_idx].mean())
+ return [res_mpvpe, res_mpjpe, res_pampjpe]
+
+ metrics = []
+ metrics.extend(report_ihmr_all(num_bin=100))
+ metrics.extend(report_ihmr_tail(percentage=10))
+ metrics.extend(report_ihmr_tail(percentage=5))
+ return metrics
diff --git a/detrsmpl/data/datasets/pipelines/__init__.py b/detrsmpl/data/datasets/pipelines/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bba193f4c44ef84b0d7b61965a50a5981207405d
--- /dev/null
+++ b/detrsmpl/data/datasets/pipelines/__init__.py
@@ -0,0 +1,65 @@
+from .compose import Compose
+from .formatting import (
+ Collect,
+ ImageToTensor,
+ ToNumpy,
+ ToPIL,
+ ToTensor,
+ Transpose,
+ to_tensor,
+)
+from .hybrik_transforms import (
+ GenerateHybrIKTarget,
+ HybrIKAffine,
+ HybrIKRandomFlip,
+ NewKeypointsSelection,
+ RandomDPG,
+ RandomOcclusion,
+)
+from .loading import LoadImageFromFile
+from .synthetic_occlusion_augmentation import SyntheticOcclusion
+from .transforms import (
+ BBoxCenterJitter,
+ CenterCrop,
+ ColorJitter,
+ GetRandomScaleRotation,
+ Lighting,
+ MeshAffine,
+ MeshAffineED,
+ Normalize,
+ RandomChannelNoise,
+ RandomHorizontalFlip,
+ Rotation,
+ SimulateLowRes,
+)
+
+__all__ = [
+ 'Compose',
+ 'to_tensor',
+ 'ToTensor',
+ 'ImageToTensor',
+ 'ToPIL',
+ 'ToNumpy',
+ 'Transpose',
+ 'Collect',
+ 'LoadImageFromFile',
+ 'CenterCrop',
+ 'RandomHorizontalFlip',
+ 'ColorJitter',
+ 'Lighting',
+ 'RandomChannelNoise',
+ 'GetRandomScaleRotation',
+ 'MeshAffine',
+ 'MeshAffineED',
+ 'HybrIKRandomFlip',
+ 'HybrIKAffine',
+ 'GenerateHybrIKTarget',
+ 'RandomDPG',
+ 'RandomOcclusion',
+ 'Rotation',
+ 'NewKeypointsSelection',
+ 'Normalize',
+ 'SyntheticOcclusion',
+ 'BBoxCenterJitter',
+ 'SimulateLowRes',
+]
diff --git a/detrsmpl/data/datasets/pipelines/compose.py b/detrsmpl/data/datasets/pipelines/compose.py
new file mode 100644
index 0000000000000000000000000000000000000000..23e1875bdfd6c042f403979ae855ab95b134a6fd
--- /dev/null
+++ b/detrsmpl/data/datasets/pipelines/compose.py
@@ -0,0 +1,41 @@
+from collections.abc import Sequence
+
+from mmcv.utils import build_from_cfg
+
+from ..builder import PIPELINES
+
+
+@PIPELINES.register_module()
+class Compose(object):
+ """Compose a data pipeline with a sequence of transforms.
+
+ Args:
+ transforms (list[dict | callable]):
+ Either config dicts of transforms or transform objects.
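+
+    Example:
+        >>> # Illustrative only; 'demo.png' is a placeholder image path.
+        >>> pipeline = Compose([
+        >>>     dict(type='LoadImageFromFile'),
+        >>>     dict(type='ToTensor', keys=['img']),
+        >>> ])
+        >>> data = pipeline(dict(img_prefix=None, image_path='demo.png'))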
+ """
+ def __init__(self, transforms):
+ assert isinstance(transforms, Sequence)
+ self.transforms = []
+ for transform in transforms:
+ if isinstance(transform, dict):
+ transform = build_from_cfg(transform, PIPELINES)
+ self.transforms.append(transform)
+ elif callable(transform):
+ self.transforms.append(transform)
+ else:
+ raise TypeError('transform must be callable or a dict, but got'
+ f' {type(transform)}')
+
+ def __call__(self, data):
+ for t in self.transforms:
+ data = t(data)
+ if data is None:
+ return None
+ return data
+
+ def __repr__(self):
+ format_string = self.__class__.__name__ + '('
+ for t in self.transforms:
+ format_string += f'\n {t}'
+ format_string += '\n)'
+ return format_string
diff --git a/detrsmpl/data/datasets/pipelines/formatting.py b/detrsmpl/data/datasets/pipelines/formatting.py
new file mode 100644
index 0000000000000000000000000000000000000000..8260ffff860fe98b6025288c3df91c6a33e722f7
--- /dev/null
+++ b/detrsmpl/data/datasets/pipelines/formatting.py
@@ -0,0 +1,319 @@
+from collections.abc import Sequence
+
+import mmcv
+import numpy as np
+import torch
+from mmcv.parallel import DataContainer as DC
+from PIL import Image
+
+from ..builder import PIPELINES
+
+
+def to_tensor(data):
+ """Convert objects of various python types to :obj:`torch.Tensor`.
+
+ Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
+ :class:`Sequence`, :class:`int` and :class:`float`.
+ """
+ if isinstance(data, torch.Tensor):
+ return data
+ elif isinstance(data, np.ndarray):
+ return torch.from_numpy(data)
+ elif isinstance(data, Sequence) and not mmcv.is_str(data):
+ return torch.tensor(data)
+ elif isinstance(data, int):
+ return torch.LongTensor([data])
+ elif isinstance(data, float):
+ return torch.FloatTensor([data])
+ else:
+ raise TypeError(
+ f'Type {type(data)} cannot be converted to tensor.'
+ 'Supported types are: `numpy.ndarray`, `torch.Tensor`, '
+ '`Sequence`, `int` and `float`')
+
+
+@PIPELINES.register_module()
+class ToTensor(object):
+ def __init__(self, keys):
+ self.keys = keys
+
+ def __call__(self, results):
+ for key in self.keys:
+ results[key] = to_tensor(results[key])
+ return results
+
+ def __repr__(self):
+ return self.__class__.__name__ + f'(keys={self.keys})'
+
+
+@PIPELINES.register_module()
+class ImageToTensor(object):
+ def __init__(self, keys):
+ self.keys = keys
+
+ def __call__(self, results):
+ for key in self.keys:
+ img = results[key]
+ if len(img.shape) < 3:
+ img = np.expand_dims(img, -1)
+ results[key] = to_tensor(img.transpose(2, 0, 1))
+ return results
+
+ def __repr__(self):
+ return self.__class__.__name__ + f'(keys={self.keys})'
+
+
+@PIPELINES.register_module()
+class Transpose(object):
+ def __init__(self, keys, order):
+ self.keys = keys
+ self.order = order
+
+ def __call__(self, results):
+ for key in self.keys:
+ results[key] = results[key].transpose(self.order)
+ return results
+
+ def __repr__(self):
+ return self.__class__.__name__ + \
+ f'(keys={self.keys}, order={self.order})'
+
+
+@PIPELINES.register_module()
+class ToPIL(object):
+ def __init__(self):
+ pass
+
+ def __call__(self, results):
+ results['img'] = Image.fromarray(results['img'])
+ return results
+
+
+@PIPELINES.register_module()
+class ToNumpy(object):
+ def __init__(self):
+ pass
+
+ def __call__(self, results):
+ results['img'] = np.array(results['img'], dtype=np.float32)
+ return results
+
+
+@PIPELINES.register_module()
+class Collect(object):
+ """Collect data from the loader relevant to the specific task.
+
+ This is usually the last stage of the data loader pipeline. Typically keys
+ is set to some subset of "img" and "gt_label".
+
+ Args:
+ keys (Sequence[str]): Keys of results to be collected in ``data``.
+ meta_keys (Sequence[str], optional): Meta keys to be converted to
+ ``mmcv.DataContainer`` and collected in ``data[img_metas]``.
+            Default: ``('filename', 'ori_filename', 'ori_shape', 'img_shape',
+            'flip', 'flip_direction', 'img_norm_cfg')``
+
+ Returns:
+ dict: The result dict contains the following keys
+            - keys in ``self.keys``
+ - ``img_metas`` if available
+ """
+ def __init__(self,
+ keys,
+ meta_keys=('filename', 'ori_filename', 'ori_shape',
+ 'img_shape', 'flip', 'flip_direction',
+ 'img_norm_cfg')):
+ self.keys = keys
+ self.meta_keys = meta_keys
+
+ def __call__(self, results):
+ data = {}
+ img_meta = {}
+ for key in self.meta_keys:
+ if key in results:
+ img_meta[key] = results[key]
+ data['img_metas'] = DC(img_meta, cpu_only=True)
+ for key in self.keys:
+ data[key] = results[key]
+ return data
+
+ def __repr__(self):
+ return self.__class__.__name__ + \
+ f'(keys={self.keys}, meta_keys={self.meta_keys})'
+
+
+@PIPELINES.register_module()
+class ToDataContainer:
+ """Convert results to :obj:`mmcv.DataContainer` by given fields.
+
+ Args:
+ fields (Sequence[dict]): Each field is a dict like
+ ``dict(key='xxx', **kwargs)``. The ``key`` in result will
+ be converted to :obj:`mmcv.DataContainer` with ``**kwargs``.
+ Default: ``(dict(key='img', stack=True), dict(key='gt_bboxes'),
+ dict(key='gt_labels'))``.
+ """
+ def __init__(self,
+ fields=(dict(key='img', stack=True), dict(key='gt_bboxes'),
+ dict(key='gt_labels'))):
+ self.fields = fields
+
+ def __call__(self, results):
+ """Call function to convert data in results to
+ :obj:`mmcv.DataContainer`.
+
+ Args:
+ results (dict): Result dict contains the data to convert.
+
+ Returns:
+ dict: The result dict contains the data converted to \
+ :obj:`mmcv.DataContainer`.
+ """
+
+ for field in self.fields:
+ field = field.copy()
+ key = field.pop('key')
+ results[key] = DC(results[key], **field)
+ return results
+
+ def __repr__(self):
+ return self.__class__.__name__ + f'(fields={self.fields})'
+
+
+@PIPELINES.register_module()
+class DefaultFormatBundle:
+ """Default formatting bundle.
+
+ It simplifies the pipeline of formatting common fields, including "img",
+ "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg".
+ These fields are formatted as follows.
+
+ - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
+ - proposals: (1)to tensor, (2)to DataContainer
+ - gt_bboxes: (1)to tensor, (2)to DataContainer
+ - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
+ - gt_labels: (1)to tensor, (2)to DataContainer
+ - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True)
+ - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \
+ (3)to DataContainer (stack=True)
+
+ Args:
+ img_to_float (bool): Whether to force the image to be converted to
+ float type. Default: True.
+ pad_val (dict): A dict for padding value in batch collating,
+ the default value is `dict(img=0, masks=0, seg=255)`.
+ Without this argument, the padding value of "gt_semantic_seg"
+ will be set to 0 by default, which should be 255.
+ """
+ def __init__(self,
+ img_to_float=True,
+ pad_val=dict(img=0, masks=0, seg=255)):
+ self.img_to_float = img_to_float
+ self.pad_val = pad_val
+
+ def __call__(self, results):
+ """Call function to transform and format common fields in results.
+
+ Args:
+ results (dict): Result dict contains the data to convert.
+
+ Returns:
+ dict: The result dict contains the data that is formatted with \
+ default bundle.
+ """
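+        # Ground-truth keys to be converted to tensors and wrapped in
+        # DataContainer below; the image itself is additionally transposed,
+        # stacked and padded.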
+ data_keys = [
+ 'center', 'scale', 'rotation', 'smpl_body_pose',
+ 'smpl_global_orient', 'smpl_betas', 'smpl_transl', 'area',
+ 'bbox_xywh', 'has_smpl', 'keypoints2d_ori', 'keypoints3d_ori',
+ 'keypoints2d_smpl', 'keypoints3d_smpl', 'has_keypoints2d_ori',
+ 'has_keypoints3d_ori', 'has_keypoints2d_smpl',
+ 'has_keypoints3d_smpl'
+ ]
+ if 'img' in results:
+ img = results['img']
+ if self.img_to_float is True and img.dtype == np.uint8:
+                # Normally, the image is of uint8 type without normalization.
+                # In that case it must be explicitly converted to float32,
+                # otherwise model training and inference will be wrong.
+                # Currently this is only needed for YOLOX.
+ img = img.astype(np.float32)
+ # add default meta keys
+ results = self._add_default_meta_keys(results)
+ if len(img.shape) < 3:
+ img = np.expand_dims(img, -1)
+ img = np.ascontiguousarray(img.transpose(2, 0, 1))
+ results['img'] = DC(to_tensor(img),
+ padding_value=self.pad_val['img'],
+ stack=True)
+ for key in data_keys:
+ if key not in results:
+ continue
+ results[key] = DC(to_tensor(results[key]))
+ # if 'gt_masks' in results:
+ # results['gt_masks'] = DC(
+ # results['gt_masks'],
+ # padding_value=self.pad_val['masks'],
+ # cpu_only=True)
+ # if 'gt_semantic_seg' in results:
+ # results['gt_semantic_seg'] = DC(
+ # to_tensor(results['gt_semantic_seg'][None, ...]),
+ # padding_value=self.pad_val['seg'],
+ # stack=True)
+ return results
+
+ def _add_default_meta_keys(self, results):
+ """Add default meta keys.
+
+ We set default meta keys including `pad_shape`, `scale_factor` and
+ `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and
+ `Pad` are implemented during the whole pipeline.
+
+ Args:
+ results (dict): Result dict contains the data to convert.
+
+ Returns:
+ results (dict): Updated result dict contains the data to convert.
+ """
+ img = results['img']
+ results.setdefault('pad_shape', img.shape)
+ results.setdefault('scale_factor', 1.0)
+ num_channels = 1 if len(img.shape) < 3 else img.shape[2]
+ results.setdefault(
+ 'img_norm_cfg',
+ dict(mean=np.zeros(num_channels, dtype=np.float32),
+ std=np.ones(num_channels, dtype=np.float32),
+ to_rgb=False))
+ return results
+
+ def __repr__(self):
+ return self.__class__.__name__ + \
+ f'(img_to_float={self.img_to_float})'
+
+
+@PIPELINES.register_module()
+class WrapFieldsToLists(object):
+ """Wrap fields of the data dictionary into lists for evaluation.
+
+ This class can be used as a last step of a test or validation
+ pipeline for single image evaluation or inference.
+
+ Example:
+        >>> test_pipeline = [
+        >>>     dict(type='LoadImageFromFile'),
+        >>>     dict(type='Normalize',
+        >>>          mean=[123.675, 116.28, 103.53],
+        >>>          std=[58.395, 57.12, 57.375],
+        >>>          to_rgb=True),
+        >>>     dict(type='ImageToTensor', keys=['img']),
+        >>>     dict(type='Collect', keys=['img']),
+        >>>     dict(type='WrapFieldsToLists')
+        >>> ]
+ """
+ def __call__(self, results):
+ # Wrap dict fields into lists
+ for key, val in results.items():
+ results[key] = [val]
+ return results
+
+ def __repr__(self):
+ return f'{self.__class__.__name__}()'
diff --git a/detrsmpl/data/datasets/pipelines/hybrik_transforms.py b/detrsmpl/data/datasets/pipelines/hybrik_transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..a00f17626c722414f587f6e53cbb037e29692c16
--- /dev/null
+++ b/detrsmpl/data/datasets/pipelines/hybrik_transforms.py
@@ -0,0 +1,877 @@
+import math
+import random
+
+import cv2
+import mmcv
+import numpy as np
+
+from detrsmpl.core.conventions.keypoints_mapping import get_flip_pairs
+from detrsmpl.utils.demo_utils import box2cs, xyxy2xywh
+from ..builder import PIPELINES
+from .transforms import (
+ _rotate_smpl_pose,
+ affine_transform,
+ get_affine_transform,
+)
+
+
+def get_bbox(bbox_xywh, w, h):
+    """Obtain a bbox in xyxy format from a bbox in xywh format, clipping it
+    so that it stays within the image bounds.
+
+    Args:
+        bbox_xywh (numpy.ndarray): bbox in format (x, y, w, h).
+        w (int): image width.
+        h (int): image height.
+
+    Returns:
+        bbox (numpy.ndarray): converted bbox in format (xmin, ymin,
+        xmax, ymax).
+ """
+ bbox_xywh = bbox_xywh.reshape(1, 4)
+ xmin, ymin, xmax, ymax = bbox_clip_xyxy(bbox_xywh_to_xyxy(bbox_xywh), w, h)
+ bbox = np.array([xmin, ymin, xmax, ymax])
+ return bbox
+
+
+def heatmap2coord(pred_jts,
+ pred_scores,
+ hm_shape,
+ bbox,
+ output_3d=False,
+ mean_bbox_scale=None):
+ """Retrieve predicted keypoints and scores from heatmap."""
+ hm_width, hm_height = hm_shape
+
+ ndims = pred_jts.dim()
+ assert ndims in [2, 3], 'Dimensions of input heatmap should be 2 or 3'
+ if ndims == 2:
+ pred_jts = pred_jts.unsqueeze(0)
+ pred_scores = pred_scores.unsqueeze(0)
+
+ coords = pred_jts.cpu().numpy()
+ coords = coords.astype(float)
+ pred_scores = pred_scores.cpu().numpy()
+ pred_scores = pred_scores.astype(float)
+
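+    # Predicted joint coordinates are normalized to [-0.5, 0.5]; shift and
+    # scale them to heatmap pixels before mapping them back to the input
+    # image with the inverse affine transform.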
+ coords[:, :, 0] = (coords[:, :, 0] + 0.5) * hm_width
+ coords[:, :, 1] = (coords[:, :, 1] + 0.5) * hm_height
+
+ preds = np.zeros_like(coords)
+ # transform bbox to scale
+ xmin, ymin, xmax, ymax = bbox
+ w = xmax - xmin
+ h = ymax - ymin
+ center = np.array([xmin + w * 0.5, ymin + h * 0.5])
+ scale = np.array([w, h])
+ # Transform back
+ for i in range(coords.shape[0]):
+ for j in range(coords.shape[1]):
+ preds[i, j, 0:2] = transform_preds(coords[i, j, 0:2], center,
+ scale, [hm_width, hm_height])
+ if output_3d:
+ if mean_bbox_scale is not None:
+ zscale = scale[0] / mean_bbox_scale
+ preds[i, j, 2] = coords[i, j, 2] / zscale
+ else:
+ preds[i, j, 2] = coords[i, j, 2]
+ # maxvals = np.ones((*preds.shape[:2], 1), dtype=float)
+ # score_mul = 1 if norm_name == 'sigmoid' else 5
+
+ return preds, pred_scores
+
+
+def transform_preds(coords, center, scale, output_size):
+ """Transform heatmap coordinates to image coordinates."""
+ target_coords = np.zeros(coords.shape)
+ trans = get_affine_transform(center,
+ scale,
+ 0,
+ output_size,
+ inv=1,
+ pixel_std=1)
+ target_coords[0:2] = affine_transform(coords[0:2], trans)
+ return target_coords
+
+
+def bbox_xywh_to_xyxy(xywh):
+ """Convert bounding boxes from format (x, y, w, h) to (xmin, ymin, xmax,
+ ymax)
+
+ Args:
+ xywh (list, tuple or numpy.ndarray): bbox in format (x, y, w, h).
+ If numpy.ndarray is provided, we expect multiple bounding boxes with
+ shape `(N, 4)`.
+
+ Returns:
+ xyxy (tuple or numpy.ndarray): Converted bboxes in format (xmin, ymin,
+        xmax, ymax). A numpy.ndarray is returned if the input is a numpy.ndarray.
+ """
+ if isinstance(xywh, (tuple, list)):
+ if not len(xywh) == 4:
+ raise IndexError(
+ 'Bounding boxes must have 4 elements, given {}'.format(
+ len(xywh)))
+ w, h = np.maximum(xywh[2] - 1, 0), np.maximum(xywh[3] - 1, 0)
+ return (xywh[0], xywh[1], xywh[0] + w, xywh[1] + h)
+ elif isinstance(xywh, np.ndarray):
+ if not xywh.size % 4 == 0:
+ raise IndexError(
+ 'Bounding boxes must have n * 4 elements, given {}'.format(
+ xywh.shape))
+ xyxy = np.hstack(
+ (xywh[:, :2], xywh[:, :2] + np.maximum(0, xywh[:, 2:4] - 1)))
+ return xyxy
+ else:
+ raise TypeError(
+            'Expect input xywh to be a list, tuple or numpy.ndarray, given {}'.
+ format(type(xywh)))
+
+
+def bbox_clip_xyxy(xyxy, width, height):
+ """Clip bounding box with format (xmin, ymin, xmax, ymax) to `(0, 0, width,
+ height)`.
+
+ Args:
+ xyxy (list, tuple or numpy.ndarray): bbox in format (xmin, ymin,
+ xmax, ymax). If numpy.ndarray is provided, we expect multiple bounding
+ boxes with shape `(N, 4)`.
+ width (int or float): Boundary width.
+ height (int or float): Boundary height.
+
+ Returns:
+ xyxy (list, tuple or numpy.ndarray): clipped bbox in format (xmin, ymin,
+ xmax, ymax) and input type
+ """
+ if isinstance(xyxy, (tuple, list)):
+ if not len(xyxy) == 4:
+ raise IndexError(
+ 'Bounding boxes must have 4 elements, given {}'.format(
+ len(xyxy)))
+ x1 = np.minimum(width - 1, np.maximum(0, xyxy[0]))
+ y1 = np.minimum(height - 1, np.maximum(0, xyxy[1]))
+ x2 = np.minimum(width - 1, np.maximum(0, xyxy[2]))
+ y2 = np.minimum(height - 1, np.maximum(0, xyxy[3]))
+ return (x1, y1, x2, y2)
+ elif isinstance(xyxy, np.ndarray):
+ if not xyxy.size % 4 == 0:
+ raise IndexError(
+ 'Bounding boxes must have n * 4 elements, given {}'.format(
+ xyxy.shape))
+ x1 = np.minimum(width - 1, np.maximum(0, xyxy[:, 0]))
+ y1 = np.minimum(height - 1, np.maximum(0, xyxy[:, 1]))
+ x2 = np.minimum(width - 1, np.maximum(0, xyxy[:, 2]))
+ y2 = np.minimum(height - 1, np.maximum(0, xyxy[:, 3]))
+ return np.hstack((x1, y1, x2, y2))
+ else:
+            'Expect input xyxy to be a list, tuple or numpy.ndarray, given {}'.
+ 'Expect input xywh a list, tuple or numpy.ndarray, given {}'.
+ format(type(xyxy)))
+
+
+def cam2pixel(cam_coord, f, c):
+    """Convert coordinates from the camera frame to the image frame given f and c.
+
+ Args:
+ cam_coord (np.ndarray): Coordinates in camera frame
+ f (list): focal length, fx, fy
+ c (list): principal point offset, x0, y0
+
+ Returns:
+ img_coord (np.ndarray): Coordinates in image frame
+ """
+
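+    # Pinhole projection: u = fx * X / Z + cx, v = fy * Y / Z + cy; a small
+    # epsilon guards against division by zero.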
+ x = cam_coord[:, 0] / (cam_coord[:, 2] + 1e-8) * f[0] + c[0]
+ y = cam_coord[:, 1] / (cam_coord[:, 2] + 1e-8) * f[1] + c[1]
+ z = cam_coord[:, 2]
+ img_coord = np.concatenate((x[:, None], y[:, None], z[:, None]), 1)
+ return img_coord
+
+
+def get_intrinsic_matrix(f, c, inv=False):
+    """Get the intrinsic matrix (or its inverse) given f and c.
+
+ Args:
+ f (list): focal length, fx, fy
+ c (list): principal point offset, x0, y0
+        inv (bool): Set True to return the inverse. Default: False.
+
+ Returns:
+ intrinsic matrix (np.ndarray): 3x3 intrinsic matrix or its inverse
+ """
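+    # K = [[fx, 0, cx], [0, fy, cy], [0, 0, 1]]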
+    intrinsic_matrix = np.zeros((3, 3)).astype(np.float32)
+    intrinsic_matrix[0, 0] = f[0]
+    intrinsic_matrix[0, 2] = c[0]
+    intrinsic_matrix[1, 1] = f[1]
+    intrinsic_matrix[1, 2] = c[1]
+    intrinsic_matrix[2, 2] = 1
+
+    if inv:
+        intrinsic_matrix = np.linalg.inv(intrinsic_matrix).astype(np.float32)
+    return intrinsic_matrix
+
+
+def aa_to_quat_numpy(axis_angle):
+ """Convert rotations given as axis/angle to quaternions.
+
+ Args:
+ axis_angle: Rotations given as a vector in axis angle form,
+ as a np.ndarray of shape (..., 3), where the magnitude is
+ the angle turned anticlockwise in radians around the
+ vector's direction.
+
+ Returns:
+ quaternions with real part first, as np.ndarray of shape (..., 4).
+ """
+ angles = np.linalg.norm(axis_angle, ord=2, axis=-1, keepdims=True)
+ half_angles = 0.5 * angles
+ eps = 1e-6
+ small_angles = np.abs(angles) < eps
+ sin_half_angles_over_angles = np.empty_like(angles)
+ sin_half_angles_over_angles[~small_angles] = (
+ np.sin(half_angles[~small_angles]) / angles[~small_angles])
+ # for x small, sin(x/2) is about x/2 - (x/2)^3/6
+ # so sin(x/2)/x is about 1/2 - (x*x)/48
+ sin_half_angles_over_angles[small_angles] = (
+ 0.5 - (angles[small_angles] * angles[small_angles]) / 48)
+ quaternions = np.concatenate(
+ [np.cos(half_angles), axis_angle * sin_half_angles_over_angles],
+ axis=-1)
+ return quaternions
+
+
+def flip_thetas(thetas, theta_pairs):
+ """Flip thetas.
+
+ Args:
+ thetas (np.ndarray): joints in shape (num_thetas, 3)
+ theta_pairs (list): flip pairs for thetas
+
+ Returns:
+ thetas_flip (np.ndarray): flipped thetas with shape (num_thetas, 3)
+ """
+ thetas_flip = thetas.copy()
+ # reflect horizontally
+ thetas_flip[:, 1] = -1 * thetas_flip[:, 1]
+ thetas_flip[:, 2] = -1 * thetas_flip[:, 2]
+ # change left-right parts
+ for pair in theta_pairs:
+ thetas_flip[pair[0], :], thetas_flip[pair[1], :] = \
+ thetas_flip[pair[1], :], thetas_flip[pair[0], :].copy()
+
+ return thetas_flip
+
+
+def flip_joints_3d(joints_3d, joints_3d_visible, width, flip_pairs):
+ """Flip 3d joints.
+
+ Args:
+        joints_3d (np.ndarray): joints in shape (N, 3, 2)
+        joints_3d_visible (np.ndarray): joint visibility in shape (N, 3, 2)
+        width (int): image width
+        flip_pairs (list): flip pairs for joints
+
+ Returns:
+ joints_3d_flipped (np.ndarray): flipped joints with shape (N, 3, 2)
+ joints_3d_visible_flipped (np.ndarray): visibility of (N, 3, 2)
+ """
+
+ assert len(joints_3d) == len(joints_3d_visible)
+ joints_3d[:, 0] = width - joints_3d[:, 0] - 1
+ joints_3d_flipped = joints_3d.copy()
+ joints_3d_visible_flipped = joints_3d_visible.copy()
+
+ # Swap left-right parts
+ for left, right in flip_pairs:
+ joints_3d_flipped[left, :] = joints_3d[right, :]
+ joints_3d_flipped[right, :] = joints_3d[left, :]
+
+ joints_3d_visible_flipped[left, :] = joints_3d_visible[right, :]
+ joints_3d_visible_flipped[right, :] = joints_3d_visible[left, :]
+
+ joints_3d_flipped = joints_3d_flipped * joints_3d_visible_flipped
+
+ return joints_3d_flipped, joints_3d_visible_flipped
+
+
+def flip_xyz_joints_3d(joints_3d, flip_pairs):
+ """Flip 3d xyz joints.
+
+ Args:
+ joints_3d (np.ndarray): Joints in shape (N, 3)
+        flip_pairs (list): flip pairs for joints
+
+ Returns:
+ joints_3d_flipped (np.ndarray): flipped joints with shape (N, 3)
+ """
+
+ joints_3d[:, 0] = -1 * joints_3d[:, 0]
+ joints_3d_flipped = joints_3d.copy()
+ # change left-right parts
+ for left, right in flip_pairs:
+ joints_3d_flipped[left, :] = joints_3d[right, :]
+ joints_3d_flipped[right, :] = joints_3d[left, :]
+
+ return joints_3d_flipped
+
+
+def flip_twist(twist_phi, twist_weight, twist_pairs):
+ """Flip twist and weight.
+
+ Args:
+ twist_phi (np.ndarray): twist in shape (num_twist, 2)
+ twist_weight (np.ndarray): weight in shape (num_twist, 2)
+ twist_pairs (list): flip pairs for twist
+
+ Returns:
+ twist_flip (np.ndarray): flipped twist with shape (num_twist, 2)
+ weight_flip (np.ndarray): flipped weights with shape (num_twist, 2)
+ """
+ # twist_flip = -1 * twist_phi.copy() # 23 x 2
+ twist_flip = np.zeros_like(twist_phi)
+ weight_flip = twist_weight.copy()
+
+ twist_flip[:, 0] = twist_phi[:, 0].copy() # cos
+ twist_flip[:, 1] = -1 * twist_phi[:, 1].copy() # sin
+ for pair in twist_pairs:
+ idx0 = pair[0] - 1
+ idx1 = pair[1] - 1
+ twist_flip[idx0, :], twist_flip[idx1, :] = \
+ twist_flip[idx1, :], twist_flip[idx0, :].copy()
+
+ weight_flip[idx0, :], weight_flip[idx1, :] = \
+ weight_flip[idx1, :], weight_flip[idx0, :].copy()
+
+ return twist_flip, weight_flip
+
+
+def _center_scale_to_box(center, scale):
+    """Convert a center/scale pair into a bbox in xyxy format.
+
+    Args:
+        center (np.ndarray): bbox center (x, y)
+        scale (np.ndarray): bbox scale (w, h)
+
+    Returns:
+        bbox (list): bbox in format [xmin, ymin, xmax, ymax]
+ """
+ pixel_std = 1.0
+ w = scale[0] * pixel_std
+ h = scale[1] * pixel_std
+ xmin = center[0] - w * 0.5
+ ymin = center[1] - h * 0.5
+ xmax = xmin + w
+ ymax = ymin + h
+ bbox = [xmin, ymin, xmax, ymax]
+ return bbox
+
+
+@PIPELINES.register_module()
+class RandomDPG(object):
+    """Add DPG for data augmentation, including random crop and random
+    sampling.
+
+    Required keys: 'bbox', 'ann_info'.
+    Modifies keys: 'bbox', 'center', 'scale'.
+
+ Args:
+ dpg_prob (float): Probability of dpg
+ """
+ def __init__(self, dpg_prob):
+ self.dpg_prob = dpg_prob
+
+ def __call__(self, results):
+ if np.random.rand() > self.dpg_prob:
+ return results
+
+ bbox = results['bbox']
+ imgwidth = results['ann_info']['width']
+ imgheight = results['ann_info']['height']
+
+ PatchScale = random.uniform(0, 1)
+ width = bbox[2] - bbox[0]
+ ht = bbox[3] - bbox[1]
+
+ if PatchScale > 0.85:
+ ratio = ht / width
+ if (width < ht):
+ patchWidth = PatchScale * width
+ patchHt = patchWidth * ratio
+ else:
+ patchHt = PatchScale * ht
+ patchWidth = patchHt / ratio
+
+ xmin = bbox[0] + random.uniform(0, 1) * (width - patchWidth)
+ ymin = bbox[1] + random.uniform(0, 1) * (ht - patchHt)
+ xmax = xmin + patchWidth + 1
+ ymax = ymin + patchHt + 1
+ else:
+ xmin = max(
+ 1,
+ min(bbox[0] + np.random.normal(-0.0142, 0.1158) * width,
+ imgwidth - 3))
+ ymin = max(
+ 1,
+ min(bbox[1] + np.random.normal(0.0043, 0.068) * ht,
+ imgheight - 3))
+ xmax = min(
+ max(xmin + 2,
+ bbox[2] + np.random.normal(0.0154, 0.1337) * width),
+ imgwidth - 3)
+ ymax = min(
+ max(ymin + 2,
+ bbox[3] + np.random.normal(-0.0013, 0.0711) * ht),
+ imgheight - 3)
+ bbox_xyxy = np.array([xmin, ymin, xmax, ymax])
+ bbox_xywh = xyxy2xywh(bbox_xyxy)
+ center, scale = box2cs(bbox_xywh,
+ aspect_ratio=1.0,
+ bbox_scale_factor=1.0)
+ results['bbox'] = bbox_xyxy
+ results['center'] = center
+ results['scale'] = scale
+
+ return results
+
+
+@PIPELINES.register_module()
+class HybrIKRandomFlip:
+ """Data augmentation with random image flip.
+
+    Required keys: 'img', 'keypoints3d', 'keypoints3d_vis', 'center',
+    'ann_info' and 'has_smpl'.
+ Additional keys required if has_smpl: 'keypoints3d17', 'keypoints3d17_vis',
+ 'keypoints3d_relative', 'keypoints3d17_relative', 'pose'
+
+    Modifies keys: 'img', 'keypoints3d', 'keypoints3d_vis', 'center', 'pose'.
+ Additional keys modified if has_smpl: 'keypoints3d17', 'keypoints3d17_vis',
+ 'keypoints3d_relative', 'keypoints3d17_relative', 'pose'
+
+ Args:
+ flip_prob (float): probability of the image being flipped. Default: 0.5
+ flip_pairs (list[int]): list of left-right keypoint pairs for flipping
+ """
+ def __init__(self, flip_prob=0.5, flip_pairs=None):
+ assert 0 <= flip_prob <= 1
+ self.flip_prob = flip_prob
+ self.flip_pairs = flip_pairs
+
+ def __call__(self, results):
+ """Perform data augmentation with random image flip."""
+ if np.random.rand() > self.flip_prob:
+ results['is_flipped'] = np.array([0])
+ return results
+
+ results['is_flipped'] = np.array([1])
+
+ # flip image
+ for key in results.get('img_fields', ['img']):
+ results[key] = mmcv.imflip(results[key], direction='horizontal')
+
+ width = results['img'][:, ::-1, :].shape[1]
+ # flip bbox center
+ center = results['center']
+ center[0] = width - 1 - center[0]
+ results['center'] = center
+
+ keypoints3d = results['keypoints3d']
+ keypoints3d_vis = results['keypoints3d_vis']
+
+ keypoints3d, keypoints3d_vis = flip_joints_3d(keypoints3d,
+ keypoints3d_vis, width,
+ self.flip_pairs)
+
+ if results['has_smpl']:
+ pose = results['pose']
+ smpl_flip_pairs = get_flip_pairs('smpl')
+ pose = flip_thetas(pose, smpl_flip_pairs)
+
+ keypoints3d17 = results['keypoints3d17']
+ keypoints3d17_vis = results['keypoints3d17_vis']
+ keypoints3d17_relative = results['keypoints3d17_relative']
+ keypoints3d_relative = results['keypoints3d_relative']
+
+ keypoints3d17, keypoints3d17_vis = flip_joints_3d(
+ keypoints3d17, keypoints3d17_vis, width, self.flip_pairs)
+ keypoints3d17_relative = flip_xyz_joints_3d(
+ keypoints3d17_relative, self.flip_pairs)
+ keypoints3d_relative = flip_xyz_joints_3d(keypoints3d_relative,
+ self.flip_pairs)
+ twist_phi, twist_weight = results['target_twist'], results[
+ 'target_twist_weight']
+ results['target_twist'], results[
+ 'target_twist_weight'] = flip_twist(twist_phi, twist_weight,
+ smpl_flip_pairs)
+
+ results['keypoints3d17_relative'] = keypoints3d17_relative.astype(
+ np.float32)
+ results['keypoints3d_relative'] = keypoints3d_relative.astype(
+ np.float32)
+ results['keypoints3d17'] = keypoints3d17.astype(np.float32)
+ results['keypoints3d17_vis'] = keypoints3d17_vis.astype(np.float32)
+ results['pose'] = pose.astype(np.float32)
+
+ results['keypoints3d'] = keypoints3d.astype(np.float32)
+ results['keypoints3d_vis'] = keypoints3d_vis.astype(np.float32)
+ return results
+
+
+@PIPELINES.register_module()
+class HybrIKAffine:
+    """Affine transform the image to get the model input, and apply the same
+    affine transform to the 2D keypoints, 3D keypoints and IUV image.
+
+ Required keys: 'img', 'keypoints3d', 'keypoints3d_vis', 'pose', 'ann_info',
+ 'scale', 'keypoints3d17', 'keypoints3d17_vis', 'rotation' and 'center'.
+ Modifies key: 'img', 'keypoints3d','keypoints3d_vis', 'pose',
+ 'keypoints3d17', 'keypoints3d17_vis'
+ """
+ def __init__(self, img_res):
+ self.image_size = np.array([img_res, img_res])
+
+ def __call__(self, results):
+
+ img = results['img']
+ keypoints3d = results['keypoints3d']
+ num_joints = len(keypoints3d)
+ keypoints3d_vis = results['keypoints3d_vis']
+ has_smpl = results['has_smpl']
+
+ c = results['center']
+ s = results['scale']
+ r = results['rotation']
+ trans = get_affine_transform(c, s, r, self.image_size, pixel_std=1)
+ img = cv2.warpAffine(
+ img,
+ trans, (int(self.image_size[0]), int(self.image_size[1])),
+ flags=cv2.INTER_LINEAR)
+
+ for i in range(num_joints):
+ if keypoints3d_vis[i, 0] > 0.0:
+ keypoints3d[i, 0:2] = affine_transform(keypoints3d[i, 0:2],
+ trans)
+
+ if has_smpl:
+
+ keypoints3d17 = results['keypoints3d17']
+ keypoints3d17_vis = results['keypoints3d17_vis']
+ for i in range(17):
+ if keypoints3d17_vis[i, 0] > 0.0:
+ keypoints3d17[i, 0:2] = affine_transform(
+ keypoints3d17[i, 0:2], trans)
+ results['keypoints3d17'] = keypoints3d17
+ results['keypoints3d17_vis'] = keypoints3d17_vis
+
+ # to rotate poses
+ pose = results['pose']
+ pose = _rotate_smpl_pose(pose.reshape(-1), r)
+ results['pose'] = pose.reshape(24, 3)
+
+ results['img'] = img.astype(np.float32)
+ results['keypoints3d_vis'] = keypoints3d_vis.astype(np.float32)
+ results['keypoints3d'] = keypoints3d.astype(np.float32)
+
+ return results
+
+
+@PIPELINES.register_module()
+class RandomOcclusion:
+ """Add random occlusion.
+
+ Add random occlusion based on occlusion probability.
+
+ Args:
+ occlusion_prob (float): probability of the image having
+ occlusion. Default: 0.5
+ """
+ def __init__(self, occlusion_prob=0.5):
+ self.occlusion_prob = occlusion_prob
+
+ def __call__(self, results):
+
+ if np.random.rand() > self.occlusion_prob:
+ return results
+
+ xmin, ymin, xmax, ymax = results['bbox']
+ imgwidth = results['ann_info']['width']
+ imgheight = results['ann_info']['height']
+ img = results['img']
+
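+        # Sample an occluder covering up to 70% of the bbox area, with aspect
+        # ratio in [0.3, 1/0.3], and fill it with random noise if it fits
+        # entirely inside the image.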
+ area_min = 0.0
+ area_max = 0.7
+ synth_area = (random.random() * (area_max - area_min) +
+ area_min) * (xmax - xmin) * (ymax - ymin)
+
+ ratio_min = 0.3
+ ratio_max = 1 / 0.3
+ synth_ratio = (random.random() * (ratio_max - ratio_min) + ratio_min)
+
+ synth_h = math.sqrt(synth_area * synth_ratio)
+ synth_w = math.sqrt(synth_area / synth_ratio)
+ synth_xmin = random.random() * ((xmax - xmin) - synth_w - 1) + xmin
+ synth_ymin = random.random() * ((ymax - ymin) - synth_h - 1) + ymin
+
+ if synth_xmin >= 0 and synth_ymin >= 0 and \
+ synth_xmin + synth_w < imgwidth and \
+ synth_ymin + synth_h < imgheight:
+ synth_xmin = int(synth_xmin)
+ synth_ymin = int(synth_ymin)
+ synth_w = int(synth_w)
+ synth_h = int(synth_h)
+ img[synth_ymin:synth_ymin + synth_h, synth_xmin:synth_xmin +
+ synth_w, :] = np.random.rand(synth_h, synth_w, 3) * 255
+
+ results['img'] = img
+
+ return results
+
+
+@PIPELINES.register_module()
+class GenerateHybrIKTarget:
+ """Generate the targets required for training.
+
+ Required keys: 'keypoints3d', 'keypoints3d_vis', 'ann_info', 'depth_factor'
+    Additional required keys if has_smpl: 'keypoints3d17', 'keypoints3d17_vis',
+    'keypoints3d_relative', 'keypoints3d17_relative'.
+    Added keys: 'target_uvd_29', 'target_xyz_24', 'target_weight_24',
+    'target_weight_29', 'target_xyz_17', 'target_weight_17', 'target_theta',
+    'target_beta', 'target_smpl_weight', 'target_theta_weight', 'trans_inv',
+    'bbox'.
+ """
+ def __init__(self, img_res, test_mode):
+ self.test_mode = test_mode
+ self.image_size = np.array([img_res, img_res])
+
+ def _integral_uvd_target_generator(self,
+ joints_3d,
+ num_joints,
+ patch_height,
+ patch_width,
+ depth_factor,
+ test_mode=False):
+
+ target_weight = np.ones((num_joints, 3), dtype=np.float32)
+ target_weight[:, 0] = joints_3d[:, 0, 1]
+ target_weight[:, 1] = joints_3d[:, 0, 1]
+ target_weight[:, 2] = joints_3d[:, 0, 1]
+
+ target = np.zeros((num_joints, 3), dtype=np.float32)
+ target[:, 0] = joints_3d[:, 0, 0] / patch_width - 0.5
+ target[:, 1] = joints_3d[:, 1, 0] / patch_height - 0.5
+ target[:, 2] = joints_3d[:, 2, 0] / depth_factor
+
+ target_weight[target[:, 0] > 0.5] = 0
+ target_weight[target[:, 0] < -0.5] = 0
+ target_weight[target[:, 1] > 0.5] = 0
+ target_weight[target[:, 1] < -0.5] = 0
+ target_weight[target[:, 2] > 0.5] = 0
+ target_weight[target[:, 2] < -0.5] = 0
+
+ target = target.reshape((-1))
+ target_weight = target_weight.reshape((-1))
+ return target, target_weight
+
+ def _integral_target_generator(self, joints_3d, num_joints, patch_height,
+ patch_width, depth_factor):
+ target_weight = np.ones((num_joints, 3), dtype=np.float32)
+ target_weight[:, 0] = joints_3d[:, 0, 1]
+ target_weight[:, 1] = joints_3d[:, 0, 1]
+ target_weight[:, 2] = joints_3d[:, 0, 1]
+
+ target = np.zeros((num_joints, 3), dtype=np.float32)
+ target[:, 0] = joints_3d[:, 0, 0] / patch_width - 0.5
+ target[:, 1] = joints_3d[:, 1, 0] / patch_height - 0.5
+ target[:, 2] = joints_3d[:, 2, 0] / depth_factor
+
+ target_weight[target[:, 0] > 0.5] = 0
+ target_weight[target[:, 0] < -0.5] = 0
+ target_weight[target[:, 1] > 0.5] = 0
+ target_weight[target[:, 1] < -0.5] = 0
+ target_weight[target[:, 2] > 0.5] = 0
+ target_weight[target[:, 2] < -0.5] = 0
+
+ target = target.reshape((-1))
+ target_weight = target_weight.reshape((-1))
+ return target, target_weight
+
+ def _integral_xyz_target_generator(self, joints_3d, joints_3d_vis,
+ num_joints, depth_factor):
+ target_weight = np.ones((num_joints, 3), dtype=np.float32)
+ target_weight[:, 0] = joints_3d_vis[:, 0]
+ target_weight[:, 1] = joints_3d_vis[:, 1]
+ target_weight[:, 2] = joints_3d_vis[:, 2]
+
+ target = np.zeros((num_joints, 3), dtype=np.float32)
+ target[:, 0] = joints_3d[:, 0] / int(depth_factor)
+ target[:, 1] = joints_3d[:, 1] / int(depth_factor)
+ target[:, 2] = joints_3d[:, 2] / int(depth_factor)
+
+ target = target.reshape((-1))
+ target_weight = target_weight.reshape((-1))
+ return target, target_weight
+
+ def _integral_target_generator_coco(self, joints_3d, num_joints,
+ patch_height, patch_width):
+ target_weight = np.ones((num_joints, 2), dtype=np.float32)
+ target_weight[:, 0] = joints_3d[:, 0, 1]
+ target_weight[:, 1] = joints_3d[:, 0, 1]
+
+ target = np.zeros((num_joints, 2), dtype=np.float32)
+ target[:, 0] = joints_3d[:, 0, 0] / patch_width - 0.5
+ target[:, 1] = joints_3d[:, 1, 0] / patch_height - 0.5
+
+ target = target.reshape((-1))
+ target_weight = target_weight.reshape((-1))
+ return target, target_weight
+
+ def __call__(self, results):
+
+ has_smpl = results['has_smpl']
+ inp_h, inp_w = self.image_size[0], self.image_size[1]
+
+ keypoints3d = results['keypoints3d']
+ num_joints = len(keypoints3d)
+ keypoints3d_vis = results['keypoints3d_vis']
+ depth_factor = results['depth_factor']
+
+ c = results['center']
+ s = results['scale']
+ r = results['rotation']
+
+ # generate new keys
+ trans_inv = get_affine_transform(c,
+ s,
+ r,
+ self.image_size,
+ inv=True,
+ pixel_std=1).astype(np.float32)
+ results['trans_inv'] = trans_inv.astype(np.float32)
+ bbox = _center_scale_to_box(c, s)
+ results['bbox'] = np.array(bbox, dtype=np.float32)
+
+ if has_smpl:
+ theta = results['pose']
+ # aa to quat
+ results['target_theta'] = aa_to_quat_numpy(theta).reshape(
+ 24 * 4).astype(np.float32)
+ theta_24_weights = np.ones((24, 4))
+ results['target_theta_weight'] = theta_24_weights.reshape(
+ 24 * 4).astype(np.float32)
+
+ results['target_beta'] = results['beta'].astype(np.float32)
+ results['target_smpl_weight'] = np.ones(1).astype(np.float32)
+
+ keypoints3d17_vis = results['keypoints3d17_vis']
+ keypoints3d17_relative = results['keypoints3d17_relative']
+ joints24_relative_3d = results['keypoints3d_relative'][:24, :]
+
+ gt_joints_29 = np.zeros((29, 3, 2), dtype=np.float32)
+ gt_joints_29[:, :, 0] = keypoints3d.copy()
+ gt_joints_29[:, :, 1] = keypoints3d_vis.copy()
+
+ target_uvd_29, target_weight_29 = \
+ self._integral_uvd_target_generator(
+ gt_joints_29, 29, inp_h, inp_w, depth_factor)
+ target_xyz_17, target_weight_17 = \
+ self._integral_xyz_target_generator(
+ keypoints3d17_relative, keypoints3d17_vis, 17,
+ depth_factor)
+ target_xyz_24, target_weight_24 = \
+ self._integral_xyz_target_generator(
+ joints24_relative_3d, keypoints3d_vis[:24, :], 24,
+ depth_factor)
+ target_weight_29 *= keypoints3d_vis.reshape(-1)
+ target_weight_24 *= keypoints3d_vis[:24, :].reshape(-1)
+ target_weight_17 *= keypoints3d17_vis.reshape(-1)
+
+ results['target_uvd_29'] = target_uvd_29.astype(np.float32)
+ results['target_xyz_24'] = target_xyz_24.astype(np.float32)
+ results['target_weight_29'] = target_weight_29.astype(np.float32)
+ results['target_weight_24'] = target_weight_24.astype(np.float32)
+ results['target_xyz_17'] = target_xyz_17.astype(np.float32)
+ results['target_weight_17'] = target_weight_17.astype(np.float32)
+ else:
+ label_uvd_29 = np.zeros((29, 3))
+ label_xyz_24 = np.zeros((24, 3))
+ label_uvd_29_mask = np.zeros((29, 3))
+ label_xyz_17 = np.zeros((17, 3))
+ label_xyz_17_mask = np.zeros((17, 3))
+
+ gt_joints = np.zeros((num_joints, 3, 2), dtype=np.float32)
+ gt_joints[:, :, 0] = keypoints3d.copy()
+ gt_joints[:, :, 1] = keypoints3d_vis.copy()
+ mask_idx = [1, 2, 6, 9, 10, 11]
+
+ if results['ann_info']['dataset_name'] == 'coco':
+ target, target_weight = self._integral_target_generator_coco(
+ gt_joints, num_joints, inp_h, inp_w)
+
+ label_jts_origin = target * target_weight
+ label_jts_mask_origin = target_weight
+
+ label_jts_origin = label_jts_origin.reshape(num_joints, 2)
+ label_jts_mask_origin = label_jts_mask_origin.reshape(
+ num_joints, 2)
+ label_jts_origin[mask_idx] = label_jts_origin[mask_idx] * 0
+                label_jts_mask_origin[
+                    mask_idx] = label_jts_mask_origin[mask_idx] * 0
+ label_uvd_29 = np.hstack([label_jts_origin, np.zeros([29, 1])])
+ label_uvd_29_mask = np.hstack(
+ [label_jts_mask_origin,
+ np.zeros([29, 1])])
+
+ elif results['ann_info']['dataset_name'] == 'mpi_inf_3dhp':
+ if not self.test_mode:
+ target, target_weight = self._integral_target_generator(
+ gt_joints, num_joints, inp_h, inp_w, depth_factor)
+ target_weight *= keypoints3d_vis.reshape(-1)
+
+ label_jts_origin = target * target_weight
+ label_jts_mask_origin = target_weight
+
+ label_jts_origin = label_jts_origin.reshape(num_joints, 3)
+ label_jts_mask_origin = label_jts_mask_origin.reshape(
+ num_joints, 3)
+ label_jts_origin[mask_idx] = label_jts_origin[mask_idx] * 0
+                label_jts_mask_origin[
+                    mask_idx] = label_jts_mask_origin[mask_idx] * 0
+ label_uvd_29 = label_jts_origin
+ label_uvd_29_mask = label_jts_mask_origin
+
+ label_uvd_29 = label_uvd_29.reshape(-1)
+ label_xyz_24 = label_xyz_24.reshape(-1)
+ label_uvd_24_mask = label_uvd_29_mask[:24, :].reshape(-1)
+ label_uvd_29_mask = label_uvd_29_mask.reshape(-1)
+ label_xyz_17 = label_xyz_17.reshape(-1)
+ label_xyz_17_mask = label_xyz_17_mask.reshape(-1)
+
+ results['target_uvd_29'] = label_uvd_29.astype(np.float32)
+ results['target_xyz_24'] = label_xyz_24.astype(np.float32)
+ results['target_weight_24'] = label_uvd_24_mask.astype(np.float32)
+ results['target_weight_29'] = label_uvd_29_mask.astype(np.float32)
+ results['target_xyz_17'] = label_xyz_17.astype(np.float32)
+ results['target_weight_17'] = label_xyz_17_mask.astype(np.float32)
+ results['target_theta'] = np.zeros(24 * 4).astype(np.float32)
+ results['target_beta'] = np.zeros(10).astype(np.float32)
+ results['target_smpl_weight'] = np.zeros(1).astype(np.float32)
+ results['target_theta_weight'] = np.zeros(24 * 4).astype(
+ np.float32)
+
+ return results
+
+
+@PIPELINES.register_module()
+class NewKeypointsSelection:
+ """Select keypoints.
+
+    Modifies the specified keypoint keys.
+
+    Args:
+        maps (list[dict]): each dict provides the 'keypoints' keys to modify
+            and the 'keypoints_index' used to select them.
+ """
+ def __init__(self, maps):
+ self.maps = maps
+
+ def __call__(self, results):
+ """Perform keypoints selection."""
+
+        for mapping in self.maps:
+            for keypoint in mapping['keypoints']:
+                keypoints_index = mapping['keypoints_index']
+ if keypoint in results:
+ results[keypoint] = results[keypoint][...,
+ keypoints_index, :]
+ return results
diff --git a/detrsmpl/data/datasets/pipelines/loading.py b/detrsmpl/data/datasets/pipelines/loading.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3a3f01c4a9242542b1552eb13519f2297295fc8
--- /dev/null
+++ b/detrsmpl/data/datasets/pipelines/loading.py
@@ -0,0 +1,85 @@
+import os.path as osp
+
+import cv2
+import mmcv
+import numpy as np
+
+from detrsmpl.data.data_structures.smc_reader import SMCReader
+from ..builder import PIPELINES
+
+
+@PIPELINES.register_module()
+class LoadImageFromFile(object):
+ """Load an image from file.
+
+ Required keys are "img_prefix" and "img_info" (a dict that must contain the
+ key "filename"). Added or updated keys are "filename", "img", "img_shape",
+ "ori_shape" (same as `img_shape`) and "img_norm_cfg" (means=0 and stds=1).
+ Both "img_shape" and "ori_shape" use (height, width) convention.
+
+ Args:
+ to_float32 (bool): Whether to convert the loaded image to a float32
+            numpy array. If set to False, the loaded image is a uint8 array.
+ Defaults to False.
+ color_type (str): The flag argument for :func:`mmcv.imfrombytes()`.
+ Defaults to 'color'.
+ file_client_args (dict): Arguments to instantiate a FileClient.
+ See :class:`mmcv.fileio.FileClient` for details.
+ Defaults to ``dict(backend='disk')``.
+ """
+ def __init__(self,
+ to_float32=False,
+ color_type='color',
+ file_client_args=dict(backend='disk')):
+ self.to_float32 = to_float32
+ self.color_type = color_type
+ self.file_client_args = file_client_args.copy()
+ self.file_client = None
+
+ def __call__(self, results):
+ if self.file_client is None:
+ self.file_client = mmcv.FileClient(**self.file_client_args)
+
+ if results['img_prefix'] is not None:
+ filename = osp.join(results['img_prefix'], results['image_path'])
+ else:
+ filename = results['image_path']
+
+ if filename.endswith('smc'):
+ assert 'image_id' in results, 'Load image from .smc, ' \
+ 'but image_id is not provided.'
+ device, device_id, frame_id = results['image_id']
+ smc_reader = SMCReader(filename)
+ img = smc_reader.get_color(device,
+ device_id,
+ frame_id,
+ disable_tqdm=True)
+ img = img.squeeze() # (1, H, W, 3) -> (H, W, 3)
+ img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR) # BGR is used
+ del smc_reader
+ else:
+ img_bytes = self.file_client.get(filename)
+ img = mmcv.imfrombytes(img_bytes, flag=self.color_type)
+
+ if self.to_float32:
+ img = img.astype(np.float32)
+
+ results['filename'] = filename
+ results['ori_filename'] = results['image_path']
+ results['img'] = img
+ results['img_shape'] = img.shape[:2]
+ results['ori_shape'] = img.shape[:2]
+ num_channels = 1 if len(img.shape) < 3 else img.shape[2]
+ results['img_norm_cfg'] = dict(mean=np.zeros(num_channels,
+ dtype=np.float32),
+ std=np.ones(num_channels,
+ dtype=np.float32),
+ to_rgb=False)
+ return results
+
+ def __repr__(self):
+ repr_str = (f'{self.__class__.__name__}('
+ f'to_float32={self.to_float32}, '
+ f"color_type='{self.color_type}', "
+ f'file_client_args={self.file_client_args})')
+ return repr_str
diff --git a/detrsmpl/data/datasets/pipelines/synthetic_occlusion_augmentation.py b/detrsmpl/data/datasets/pipelines/synthetic_occlusion_augmentation.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ca81ef0f1dfeebe44637741fd1f4a57d644e414
--- /dev/null
+++ b/detrsmpl/data/datasets/pipelines/synthetic_occlusion_augmentation.py
@@ -0,0 +1,137 @@
+"""This script is modified from https://github.com/ isarandi/synthetic-
+occlusion.
+
+Original license please see docs/additional_licenses.md.
+"""
+import os.path
+import random
+
+import cv2
+import numpy as np
+
+from ..builder import PIPELINES
+
+
+def load_pascal_occluders(occluders_file):
+ """load pascal occluders from the occluder file."""
+
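+    # The .npy file is expected to hold a pickled sequence of RGBA occluder
+    # patches; the alpha channel is later used by `paste_over` for blending.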
+ if os.path.isfile(occluders_file):
+ return np.load(occluders_file, allow_pickle=True)
+    else:
+        raise FileNotFoundError(
+            f'Occluders file does not exist: {occluders_file}')
+
+
+def occlude_with_pascal_objects(im, occluders):
+ """Returns an augmented version of `im`, containing some occluders from the
+ Pascal VOC dataset."""
+
+ result = im.copy()
+ width_height = np.asarray([im.shape[1], im.shape[0]])
+ im_scale_factor = min(width_height) / 256
+ count = np.random.randint(1, 8)
+
+ # logger.debug(f'Number of augmentation objects: {count}')
+
+ for _ in range(count):
+ occluder = random.choice(occluders)
+
+ center = np.random.uniform([0, 0], width_height)
+ random_scale_factor = np.random.uniform(0.2, 1.0)
+ scale_factor = random_scale_factor * im_scale_factor
+
+ # logger.debug(f'occluder size: {occluder.shape},
+ # scale_f: {scale_factor}, img_scale: {im_scale_factor}')
+ occluder = resize_by_factor(occluder, scale_factor)
+
+ paste_over(im_src=occluder, im_dst=result, center=center)
+
+ return result
+
+
+def paste_over(im_src, im_dst, center):
+ """Pastes `im_src` onto `im_dst` at a specified position, with alpha
+ blending, in place.
+
+ Locations outside the bounds of `im_dst`
+ are handled as expected (only a part or none of `im_src` becomes visible).
+
+ Args:
+ im_src: The RGBA image to be pasted onto `im_dst`.
+ Its size can be arbitrary.
+        im_dst: The target image.
+        center: Coordinates in `im_dst` where the center of `im_src`
+            should be placed.
+
+    The alpha channel of `im_src` (0-255) controls the blending at each
+    pixel; larger values mean more visibility for `im_src`.
+ """
+
+ width_height_src = np.asarray([im_src.shape[1], im_src.shape[0]])
+ width_height_dst = np.asarray([im_dst.shape[1], im_dst.shape[0]])
+
+ center = np.round(center).astype(np.int32)
+ raw_start_dst = center - width_height_src // 2
+ raw_end_dst = raw_start_dst + width_height_src
+
+ start_dst = np.clip(raw_start_dst, 0, width_height_dst)
+ end_dst = np.clip(raw_end_dst, 0, width_height_dst)
+ region_dst = im_dst[start_dst[1]:end_dst[1], start_dst[0]:end_dst[0]]
+
+ start_src = start_dst - raw_start_dst
+ end_src = width_height_src + (end_dst - raw_end_dst)
+ region_src = im_src[start_src[1]:end_src[1], start_src[0]:end_src[0]]
+ color_src = region_src[..., 0:3]
+ alpha = region_src[..., 3:].astype(np.float32) / 255
+
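+    # Standard alpha compositing, written back into the destination region
+    # in place: out = alpha * src + (1 - alpha) * dst.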
+ im_dst[start_dst[1]:end_dst[1],
+ start_dst[0]:end_dst[0]] = (alpha * color_src +
+ (1 - alpha) * region_dst)
+
+
+def resize_by_factor(im, factor):
+ """Returns a copy of `im` resized by `factor`, using bilinear interp for up
+ and area interp for downscaling."""
+ new_size = tuple(
+ np.round(np.array([im.shape[1], im.shape[0]]) * factor).astype(int))
+ interp = cv2.INTER_LINEAR if factor > 1.0 else cv2.INTER_AREA
+ return cv2.resize(im, new_size, fx=factor, fy=factor, interpolation=interp)
+
+
+def list_filepaths(dirpath):
+ """list the file paths."""
+ names = os.listdir(dirpath)
+ paths = [os.path.join(dirpath, name) for name in names]
+ return sorted(filter(os.path.isfile, paths))
+
+
+@PIPELINES.register_module()
+class SyntheticOcclusion:
+ """Data augmentation with synthetic occlusion.
+
+ Required keys: 'img'
+ Modifies key: 'img'
+    Args:
+        occluders_file (str): path to a .npy file with pre-generated
+            Pascal VOC occluder patches. Used when `occluders` is None.
+        occluders: pre-loaded occluder patches. If given, `occluders_file`
+            is ignored.
+ """
+ def __init__(self, occluders_file='', occluders=None):
+ self.occluders = None
+ if occluders is not None:
+ self.occluders = occluders
+
+ else:
+ self.occluders = load_pascal_occluders(
+ occluders_file=occluders_file, )
+
+ def __call__(self, results):
+ """Perform data augmentation with random channel noise."""
+ img = results['img']
+
+ img = occlude_with_pascal_objects(img, self.occluders)
+
+ results['img'] = img
+ return results
diff --git a/detrsmpl/data/datasets/pipelines/transforms.py b/detrsmpl/data/datasets/pipelines/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..826088283cf16be2371f29902704fe5518b033d7
--- /dev/null
+++ b/detrsmpl/data/datasets/pipelines/transforms.py
@@ -0,0 +1,1284 @@
+import math
+import random
+from collections.abc import Iterable
+
+import cv2
+import mmcv
+import numpy as np
+
+from detrsmpl.utils.demo_utils import xywh2xyxy, xyxy2xywh
+from detrsmpl.core.conventions.keypoints_mapping import get_flip_pairs
+from detrsmpl.utils.transforms import aa_to_rotmat, rotmat_to_aa
+from ..builder import PIPELINES
+from .compose import Compose
+
+
+def get_affine_transform(center,
+ scale,
+ rot,
+ output_size,
+ shift=(0., 0.),
+ inv=False,
+ pixel_std=1.0):
+ """Get the affine transform matrix, given the center/scale/rot/output_size.
+
+ Args:
+ center (np.ndarray[2, ]): Center of the bounding box (x, y).
+ scale (np.ndarray[2, ]): Scale of the bounding box
+ wrt [width, height].
+ rot (float): Rotation angle (degree).
+ output_size (np.ndarray[2, ] | list(2,)): Size of the
+ destination heatmaps.
+        shift (tuple[float]): Shift translation ratio wrt the width/height,
+            in the range 0-1. Default: (0., 0.).
+        inv (bool): Option to invert the affine transform direction
+            (inv=False: src->dst, inv=True: dst->src).
+        pixel_std (float): Standard deviation of pixels used to scale the
+            bounding-box `scale`. Default: 1.0.
+ Returns:
+ np.ndarray: The transform matrix.
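+
+    Example (illustrative values only, not tied to any dataset):
+        >>> img = np.zeros((480, 640, 3), dtype=np.uint8)
+        >>> trans = get_affine_transform(np.array([320., 240.]),
+        ...                              np.array([200., 200.]), 0.,
+        ...                              np.array([256, 256]))
+        >>> patch = cv2.warpAffine(img, trans, (256, 256))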
+ """
+ assert len(center) == 2
+ assert len(scale) == 2
+ assert len(output_size) == 2
+ assert len(shift) == 2
+
+ scale_tmp = scale * pixel_std
+
+ shift = np.array(shift)
+ src_h = scale_tmp[1]
+ dst_w = output_size[0]
+ dst_h = output_size[1]
+
+ rot_rad = np.pi * rot / 180
+ src_dir = rotate_point([0., src_h * -0.5], rot_rad)
+ dst_dir = np.array([0., dst_h * -0.5])
+
+ src = np.zeros((3, 2), dtype=np.float32)
+ src[0, :] = center + scale_tmp * shift
+ src[1, :] = center + src_dir + scale_tmp * shift
+ src[2, :] = _get_3rd_point(src[0, :], src[1, :])
+
+ dst = np.zeros((3, 2), dtype=np.float32)
+ dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
+ dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
+ dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
+
+ if inv:
+ trans = cv2.getAffineTransform(np.float32(dst), np.float32(src))
+ else:
+ trans = cv2.getAffineTransform(np.float32(src), np.float32(dst))
+
+ return trans
+
+
+def affine_transform(pt, trans_mat):
+ """Apply an affine transformation to the points.
+
+ Args:
+ pt (np.ndarray): a 2 dimensional point to be transformed
+ trans_mat (np.ndarray): 2x3 matrix of an affine transform
+ Returns:
+ np.ndarray: Transformed points.
+ """
+ if pt.ndim == 2:
+ new_pt = np.einsum('ij,mj->im', pt, trans_mat)
+ elif pt.ndim == 3:
+ new_pt = np.einsum('nij,mj->nim', pt, trans_mat)
+ else:
+        msg = f'Expected pt to have ndim of 2 or 3, but got {pt.ndim}.'
+ raise ValueError(msg)
+ # new_pt = np.array(trans_mat) @ np.array([pt[0], pt[1], 1.])
+
+ return new_pt
+
+
+def _get_3rd_point(a, b):
+ """To calculate the affine matrix, three pairs of points are required. This
+ function is used to get the 3rd point, given 2D points a & b.
+
+ The 3rd point is defined by rotating vector `a - b` by 90 degrees
+ anticlockwise, using b as the rotation center.
+ Args:
+ a (np.ndarray): point(x,y)
+ b (np.ndarray): point(x,y)
+ Returns:
+ np.ndarray: The 3rd point.
+ """
+ assert len(a) == 2
+ assert len(b) == 2
+ direction = a - b
+ third_pt = b + np.array([-direction[1], direction[0]], dtype=np.float32)
+
+ return third_pt
+
+
+def rotate_point(pt, angle_rad):
+ """Rotate a point by an angle.
+
+ Args:
+ pt (list[float]): 2 dimensional point to be rotated
+        angle_rad (float): rotation angle in radians
+ Returns:
+ list[float]: Rotated point.
+ """
+ assert len(pt) == 2
+ sn, cs = np.sin(angle_rad), np.cos(angle_rad)
+ new_x = pt[0] * cs - pt[1] * sn
+ new_y = pt[0] * sn + pt[1] * cs
+ rotated_pt = [new_x, new_y]
+
+ return rotated_pt
+
+
+def get_warp_matrix(theta, size_input, size_dst, size_target):
+ """Calculate the transformation matrix under the constraint of unbiased.
+
+ Paper ref: Huang et al. The Devil is in the Details: Delving into Unbiased
+ Data Processing for Human Pose Estimation (CVPR 2020).
+ Args:
+ theta (float): Rotation angle in degrees.
+ size_input (np.ndarray): Size of input image [w, h].
+ size_dst (np.ndarray): Size of output image [w, h].
+ size_target (np.ndarray): Size of ROI in input plane [w, h].
+ Returns:
+ matrix (np.ndarray): A matrix for transformation.
+ """
+ theta = np.deg2rad(theta)
+ matrix = np.zeros((2, 3), dtype=np.float32)
+ scale_x = size_dst[0] / size_target[0]
+ scale_y = size_dst[1] / size_target[1]
+ matrix[0, 0] = math.cos(theta) * scale_x
+ matrix[0, 1] = -math.sin(theta) * scale_x
+ matrix[0, 2] = scale_x * (-0.5 * size_input[0] * math.cos(theta) +
+ 0.5 * size_input[1] * math.sin(theta) +
+ 0.5 * size_target[0])
+ matrix[1, 0] = math.sin(theta) * scale_y
+ matrix[1, 1] = math.cos(theta) * scale_y
+ matrix[1, 2] = scale_y * (-0.5 * size_input[0] * math.sin(theta) -
+ 0.5 * size_input[1] * math.cos(theta) +
+ 0.5 * size_target[1])
+ return matrix
+
+
+def warp_affine_joints(joints, mat):
+ """Apply affine transformation defined by the transform matrix on the
+ joints.
+
+ Args:
+        joints (np.ndarray[..., 2]): Original coordinates of joints.
+        mat (np.ndarray[2, 3]): The affine transform matrix.
+    Returns:
+        np.ndarray[..., 2]: Transformed coordinates of joints.
+ """
+ joints = np.array(joints)
+ shape = joints.shape
+ joints = joints.reshape(-1, 2)
+ return np.dot(np.concatenate((joints, joints[:, 0:1] * 0 + 1), axis=1),
+ mat.T).reshape(shape)
+
+
+def _construct_rotation_matrix(rot, size=3):
+ """Construct the in-plane rotation matrix.
+
+ Args:
+ rot (float): Rotation angle (degree).
+ size (int): The size of the rotation matrix.
+ Candidate Values: 2, 3. Defaults to 3.
+ Returns:
+        rot_mat (np.ndarray([size, size])): Rotation matrix.
+ """
+ rot_mat = np.eye(size, dtype=np.float32)
+ if rot != 0:
+ rot_rad = np.deg2rad(rot)
+ sn, cs = np.sin(rot_rad), np.cos(rot_rad)
+ rot_mat[0, :2] = [cs, -sn]
+ rot_mat[1, :2] = [sn, cs]
+
+ return rot_mat
+
+
+def _flip_smpl_pose(pose):
+ """Flip SMPL pose parameters horizontally.
+
+ Args:
+ pose (np.ndarray([72])): SMPL pose parameters
+ Returns:
+ pose_flipped
+ """
+
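+    # `flippedParts` permutes the 24 axis-angle joints (72 = 24 * 3 values)
+    # so that left and right body parts are swapped; negating the y and z
+    # components then mirrors each rotation for a horizontal flip.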
+ flippedParts = [
+ 0, 1, 2, 6, 7, 8, 3, 4, 5, 9, 10, 11, 15, 16, 17, 12, 13, 14, 18, 19,
+ 20, 24, 25, 26, 21, 22, 23, 27, 28, 29, 33, 34, 35, 30, 31, 32, 36, 37,
+ 38, 42, 43, 44, 39, 40, 41, 45, 46, 47, 51, 52, 53, 48, 49, 50, 57, 58,
+ 59, 54, 55, 56, 63, 64, 65, 60, 61, 62, 69, 70, 71, 66, 67, 68
+ ]
+ pose_flipped = pose[..., flippedParts]
+ # Negate the second and the third dimension of the axis-angle
+ pose_flipped[..., 1::3] = -pose_flipped[..., 1::3]
+ pose_flipped[..., 2::3] = -pose_flipped[..., 2::3]
+ return pose_flipped
+
+
+def _flip_smplx_pose(pose):
+ """Flip SMPLX pose parameters horizontally.
+
+ Args:
+ pose (np.ndarray([63])): SMPLX pose parameters
+ Returns:
+ pose_flipped (np.ndarray([21,3]))
+ """
+ flippedParts = np.array([
+ 6, 7, 8, 3, 4, 5, 9, 10, 11, 15, 16, 17, 12, 13, 14, 18, 19, 20, 24,
+ 25, 26, 21, 22, 23, 27, 28, 29, 33, 34, 35, 30, 31, 32, 36, 37, 38, 42,
+ 43, 44, 39, 40, 41, 45, 46, 47, 51, 52, 53, 48, 49, 50, 57, 58, 59, 54,
+ 55, 56, 63, 64, 65, 60, 61, 62
+ ],
+ dtype=np.int32) - 3
+ dim_flip = np.array([1, -1, -1], dtype=pose.dtype)
+ pose = (pose[..., flippedParts].reshape(-1, 21, 3) * dim_flip).copy()
+ return pose
+
+
+def _flip_axis_angle(r):
+ """Flip axis_angle horizontally.
+
+ Args:
+ r (np.ndarray([3]))
+ Returns:
+ f_flipped
+ """
+ dim_flip = np.array([1, -1, -1], dtype=r.dtype)
+ r = r * dim_flip
+ return r
+
+
+def _flip_hand_pose(r_pose, l_pose):
+ dim_flip = np.array([1, -1, -1], dtype=r_pose.dtype)
+ ret_l_pose = r_pose * dim_flip
+ ret_r_pose = l_pose * dim_flip
+ return ret_r_pose, ret_l_pose
+
+
+def _flip_keypoints(keypoints, flip_pairs, img_width=None):
+ """Flip human joints horizontally.
+
+ Note:
+ num_keypoints: K
+ num_dimension: D
+ Args:
+ keypoints (np.ndarray([K, D])): Coordinates of keypoints.
+ flip_pairs (list[tuple()]): Pairs of keypoints which are mirrored
+ (for example, left ear -- right ear).
+ img_width (int | None, optional): The width of the original image.
+ To flip 2D keypoints, image width is needed. To flip 3D keypoints,
+ we simply negate the value of x-axis. Default: None.
+ Returns:
+ keypoints_flipped
+ """
+
+ keypoints_flipped = keypoints.copy()
+
+ # Swap left-right parts
+ for left, right in flip_pairs:
+ keypoints_flipped[..., left, :] = keypoints[..., right, :]
+ keypoints_flipped[..., right, :] = keypoints[..., left, :]
+
+ # Flip horizontally
+ if img_width is None:
+ keypoints_flipped[..., 0] = -keypoints_flipped[..., 0]
+ else:
+ keypoints_flipped[..., 0] = img_width - 1 - keypoints_flipped[..., 0]
+
+ return keypoints_flipped
+
+
+def _rotate_joints_3d(joints_3d, rot):
+ """Rotate the 3D joints in the local coordinates.
+
+ Notes:
+ Joints number: K
+ Args:
+ joints_3d (np.ndarray([K, 3])): Coordinates of keypoints.
+ rot (float): Rotation angle (degree).
+ Returns:
+ joints_3d_rotated
+ """
+ # in-plane rotation
+ # 3D joints are rotated counterclockwise,
+ # so the rot angle is inversed.
+ rot_mat = _construct_rotation_matrix(-rot, 3)
+ if joints_3d.ndim == 2:
+ joints_3d_rotated = np.einsum('ij,kj->ki', rot_mat, joints_3d)
+ elif joints_3d.ndim == 3:
+ joints_3d_rotated = np.einsum('ij,mkj->mki', rot_mat, joints_3d)
+ else:
+        msg = ('Expected joints_3d to have ndim of 2 or 3, '
+               f'but got {joints_3d.ndim}.')
+ raise ValueError(msg)
+ joints_3d_rotated = joints_3d_rotated.astype('float32')
+ return joints_3d_rotated
+
+
+def _rotate_smpl_pose(pose, rot):
+ """Rotate SMPL pose parameters.
+
+ SMPL (https://smpl.is.tue.mpg.de/) is a 3D
+ human model.
+ Args:
+ pose (np.ndarray([72])): SMPL pose parameters
+ rot (float): Rotation angle (degree).
+ Returns:
+ pose_rotated
+ """
+ pose_rotated = pose.copy()
+ if rot != 0:
+ # rot_mat = _construct_rotation_matrix(-rot)
+ # orient = pose[:3]
+ # # find the rotation of the body in camera frame
+ # per_rdg, _ = cv2.Rodrigues(orient.astype(np.float32))
+ # # apply the global rotation to the global orientation
+ # res_rot, _ = cv2.Rodrigues(np.dot(rot_mat, per_rdg))
+ # pose_rotated[:3] = (res_rot.T)[0]
+
+ # use pytorch3d
+ rot_mat = _construct_rotation_matrix(-rot)
+ orient = pose[..., :3]
+ per_rdg = aa_to_rotmat(orient)
+
+ if pose.ndim == 1:
+ tmp_rot = np.einsum('ij,jk->ik', rot_mat, per_rdg)
+ elif pose.ndim == 2:
+ tmp_rot = np.einsum('ij,mjk->mik', rot_mat, per_rdg)
+ else:
+            msg = f'Expected pose to have ndim of 1 or 2, but got {pose.ndim}.'
+ raise ValueError(msg)
+
+ res_rot = rotmat_to_aa(tmp_rot)
+ pose_rotated[..., :3] = res_rot
+
+ # use cv2
+ # rot_mat = _construct_rotation_matrix(-rot)
+ # for i in range(pose.shape[0]):
+ # orient = pose[i, :3]
+ # # find the rotation of the body in camera frame
+ # per_rdg, _ = cv2.Rodrigues(orient.astype(np.float32))
+ # # apply the global rotation to the global orientation
+ # res_rot, _ = cv2.Rodrigues(np.dot(rot_mat, per_rdg))
+ # pose_rotated[i, :3] = (res_rot.T)[0]
+
+ return pose_rotated
+
+
+def _bbox_flip(bboxes, img_shape, direction):
+ """Flip bboxes horizontally.
+
+ Args:
+ bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k)
+ img_shape (tuple[int]): Image shape (height, width)
+        direction (str): Flip direction. Options are 'horizontal',
+            'vertical' and 'diagonal'.
+
+ Returns:
+ numpy.ndarray: Flipped bounding boxes.
+ """
+
+ assert bboxes.shape[-1] % 5 == 0
+ flipped = bboxes.copy()
+ if direction == 'horizontal':
+ w = img_shape[1]
+ flipped[..., 0::4] = w - bboxes[..., 2::4]
+ flipped[..., 2::4] = w - bboxes[..., 0::4]
+ elif direction == 'vertical':
+ h = img_shape[0]
+ flipped[..., 1::4] = h - bboxes[..., 3::4]
+ flipped[..., 3::4] = h - bboxes[..., 1::4]
+ elif direction == 'diagonal':
+ w = img_shape[1]
+ h = img_shape[0]
+ flipped[..., 0::4] = w - bboxes[..., 2::4]
+ flipped[..., 1::4] = h - bboxes[..., 3::4]
+ flipped[..., 2::4] = w - bboxes[..., 0::4]
+ flipped[..., 3::4] = h - bboxes[..., 1::4]
+ else:
+ raise ValueError(f"Invalid flipping direction '{direction}'")
+ return flipped
+
+
+@PIPELINES.register_module()
+class RandomHorizontalFlip(object):
+ """Flip the image randomly.
+
+    Flip the image randomly based on the flip probability.
+
+    Args:
+        flip_prob (float): probability of the image being flipped. Default: 0.5
+        convention (str): keypoint convention used to look up the
+            left-right flip pairs.
+ """
+ def __init__(self, flip_prob=0.5, convention=None):
+ assert 0 <= flip_prob <= 1
+ self.flip_prob = flip_prob
+ self.flip_pairs = get_flip_pairs(convention)
+
+ def __call__(self, results):
+ """Call function to flip image and annotations.
+
+ Args:
+ results (dict): Result dict from loading pipeline.
+
+ Returns:
+ dict: Flipped results, 'flip' key is added into
+ result dict.
+ """
+ if np.random.rand() > self.flip_prob:
+ results['is_flipped'] = np.array([0])
+ return results
+
+ results['is_flipped'] = np.array([1])
+
+ # flip image
+ for key in results.get('img_fields', ['img']):
+ results[key] = mmcv.imflip(results[key], direction='horizontal')
+
+ # flip keypoints2d
+ if 'keypoints2d' in results:
+ assert self.flip_pairs is not None
+ width = results['img'][:, ::-1, :].shape[1]
+ keypoints2d = results['keypoints2d'].copy()
+ keypoints2d = _flip_keypoints(keypoints2d, self.flip_pairs, width)
+ results['keypoints2d'] = keypoints2d
+ elif 'keypoints2d_ori' in results:
+ assert self.flip_pairs is not None
+ width = results['img'][:, ::-1, :].shape[1]
+ keypoints2d = results['keypoints2d_ori'].copy()
+ keypoints2d = _flip_keypoints(keypoints2d, self.flip_pairs, width)
+ results['keypoints2d_ori'] = keypoints2d
+
+ if 'keypoints2d_smpl' in results:
+ assert self.flip_pairs is not None
+ width = results['img'][:, ::-1, :].shape[1]
+ keypoints2d = results['keypoints2d_smpl'].copy()
+ keypoints2d = _flip_keypoints(keypoints2d, self.flip_pairs, width)
+ results['keypoints2d_smpl'] = keypoints2d
+
+        # flip bbox center
+        width = results['img'].shape[1]
+        center = results['center']
+        center[..., 0] = width - 1 - center[..., 0]
+        results['center'] = center
+
+ # flip keypoints3d
+ if 'keypoints3d' in results:
+ assert self.flip_pairs is not None
+ keypoints3d = results['keypoints3d'].copy()
+ keypoints3d = _flip_keypoints(keypoints3d, self.flip_pairs)
+ results['keypoints3d'] = keypoints3d
+ elif 'keypoints3d_ori' in results:
+ assert self.flip_pairs is not None
+ keypoints3d = results['keypoints3d_ori'].copy()
+ keypoints3d = _flip_keypoints(keypoints3d, self.flip_pairs)
+ results['keypoints3d_ori'] = keypoints3d
+
+ if 'keypoints3d_smpl' in results:
+ assert self.flip_pairs is not None
+ keypoints3d = results['keypoints3d_smpl'].copy()
+ keypoints3d = _flip_keypoints(keypoints3d, self.flip_pairs)
+ results['keypoints3d_smpl'] = keypoints3d
+
+ if 'bbox_xywh' in results:
+ width = results['img'].shape[1]
+ bbox_xywh = results['bbox_xywh'].copy()
+ bbox_xyxy = xywh2xyxy(bbox_xywh)
+
+ bbox_xyxy = bbox_xyxy[:, [2, 1, 0, 3]] * np.array(
+ [-1, 1, -1, 1]) + np.array([width, 0, width, 0])
+
+ # img = mmcv.imshow_bboxes(results['img'], bbox_xyxy, show=False)
+ # cv2.imwrite('test.png',img)
+ results['bbox_xywh'] = xyxy2xywh(bbox_xyxy)
+
+ # flip smpl
+ if 'smpl_body_pose' in results:
+ global_orient = results['smpl_global_orient'].copy()
+ body_pose = results['smpl_body_pose'].copy().reshape((-1, 23 * 3))
+ smpl_pose = np.concatenate((global_orient, body_pose), axis=-1)
+ smpl_pose_flipped = _flip_smpl_pose(smpl_pose)
+ global_orient = smpl_pose_flipped[..., :3]
+ body_pose = smpl_pose_flipped[..., 3:]
+ results['smpl_global_orient'] = global_orient
+ results['smpl_body_pose'] = body_pose.reshape((-1, 23, 3))
+
+ # TODO: to check multi-human for smplx
+ if 'smplx_body_pose' in results:
+
+ body_pose = results['smplx_body_pose'].copy().reshape((-1))
+ body_pose_flipped = _flip_smplx_pose(body_pose)
+ results['smplx_body_pose'] = body_pose_flipped
+
+ if 'smplx_global_orient' in results:
+ global_orient = results['smplx_global_orient'].copy().reshape((-1))
+ global_orient_flipped = _flip_axis_angle(global_orient)
+ results['smplx_global_orient'] = global_orient_flipped
+
+ if 'smplx_jaw_pose' in results:
+ jaw_pose = results['smplx_jaw_pose'].copy().reshape((-1))
+ jaw_pose_flipped = _flip_axis_angle(jaw_pose)
+ results['smplx_jaw_pose'] = jaw_pose_flipped
+
+ if 'smplx_right_hand_pose' in results:
+ right_hand_pose = results['smplx_right_hand_pose'].copy()
+ left_hand_pose = results['smplx_left_hand_pose'].copy()
+ results['smplx_right_hand_pose'], results[
+ 'smplx_left_hand_pose'] = _flip_hand_pose(
+ right_hand_pose, left_hand_pose)
+
+ # Expressions are not symmetric. Remove them when flipped.
+ if 'smplx_expression' in results:
+ results['smplx_expression'] = np.zeros(
+ (results['smplx_expression'].shape[0]), dtype=np.float32)
+ results['has_smplx_expression'] = 0
+
+ return results
+
+ def __repr__(self):
+ return self.__class__.__name__ + f'(flip_prob={self.flip_prob})'
+
+
+def resize(ori_shape, size, max_size=None):
+ # size can be min_size (scalar) or (w, h) tuple
+ def get_size_with_aspect_ratio(image_size, size, max_size=None):
+ w, h = image_size
+ if max_size is not None:
+ min_original_size = float(min((w, h)))
+ max_original_size = float(max((w, h)))
+ if max_original_size / min_original_size * size > max_size:
+ size = int(
+ round(max_size * min_original_size / max_original_size))
+
+ if (w <= h and w == size) or (h <= w and h == size):
+ return (w, h)
+
+ if w < h:
+ ow = size
+ oh = int(size * h / w)
+ else:
+ oh = size
+ ow = int(size * w / h)
+
+ return (ow, oh)
+
+ def get_size(ori_shape, size, max_size=None):
+ if isinstance(size, (list, tuple)):
+ return size[::-1]
+ else:
+ return get_size_with_aspect_ratio(ori_shape, size, max_size)
+
+ size = get_size(ori_shape, size, max_size)
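+    # Worked example (illustrative): for ori_shape=(480, 640) given as
+    # (h, w), size=800 and max_size=1333, the shorter side is scaled to 800
+    # and the longer side to 1066, so this returns (800, 1066), which
+    # MeshAffineED treats as the new (h, w).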
+
+ return size
+
+
+@PIPELINES.register_module()
+class CenterCrop(object):
+ r"""Center crop the image.
+
+ Args:
+ crop_size (int | tuple): Expected size after cropping with the format
+ of (h, w).
+ efficientnet_style (bool): Whether to use efficientnet style center
+ crop. Defaults to False.
+ crop_padding (int): The crop padding parameter in efficientnet style
+ center crop. Only valid if efficientnet style is True. Defaults to
+ 32.
+ interpolation (str): Interpolation method, accepted values are
+ 'nearest', 'bilinear', 'bicubic', 'area', 'lanczos'. Only valid if
+ efficientnet style is True. Defaults to 'bilinear'.
+        backend (str): The image resize backend type, accepted values are
+ `cv2` and `pillow`. Only valid if efficientnet style is True.
+ Defaults to `cv2`.
+
+
+ Notes:
+ If the image is smaller than the crop size, return the original image.
+        If efficientnet_style is set to False, the pipeline is a simple
+        center crop using the crop_size.
+        If efficientnet_style is set to True, the pipeline first performs
+        the center crop with an intermediate crop_size_ computed as:
+
+ .. math::
+ crop\_size\_ = crop\_size / (crop\_size + crop\_padding) * short\_edge
+
+ And then the pipeline resizes the img to the input crop size.
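+
+        For example, with ``crop_size=224``, ``crop_padding=32`` and a short
+        edge of 320 pixels, the intermediate crop size is
+        224 / 256 * 320 = 280 pixels, which is then resized back to 224.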
+ """
+ def __init__(self,
+ crop_size,
+ efficientnet_style=False,
+ crop_padding=32,
+ interpolation='bilinear',
+ backend='cv2'):
+ if efficientnet_style:
+ assert isinstance(crop_size, int)
+ assert crop_padding >= 0
+ assert interpolation in ('nearest', 'bilinear', 'bicubic', 'area',
+ 'lanczos')
+ if backend not in ['cv2', 'pillow']:
+ raise ValueError(
+ f'backend: {backend} is not supported for '
+ 'resize. Supported backends are "cv2", "pillow"')
+ else:
+ assert isinstance(crop_size, int) or (isinstance(crop_size, tuple)
+ and len(crop_size) == 2)
+ if isinstance(crop_size, int):
+ crop_size = (crop_size, crop_size)
+ assert crop_size[0] > 0 and crop_size[1] > 0
+ self.crop_size = crop_size
+ self.efficientnet_style = efficientnet_style
+ self.crop_padding = crop_padding
+ self.interpolation = interpolation
+ self.backend = backend
+
+ def __call__(self, results):
+ crop_height, crop_width = self.crop_size[0], self.crop_size[1]
+ for key in results.get('img_fields', ['img']):
+ img = results[key]
+ # img.shape has length 2 for grayscale, length 3 for color
+ img_height, img_width = img.shape[:2]
+
+ # https://github.com/tensorflow/tpu/blob/master/models/official/efficientnet/preprocessing.py#L118 # noqa
+ if self.efficientnet_style:
+ img_short = min(img_height, img_width)
+ crop_height = crop_height / (crop_height +
+ self.crop_padding) * img_short
+ crop_width = crop_width / (crop_width +
+ self.crop_padding) * img_short
+
+ y1 = max(0, int(round((img_height - crop_height) / 2.)))
+ x1 = max(0, int(round((img_width - crop_width) / 2.)))
+ y2 = min(img_height, y1 + crop_height) - 1
+ x2 = min(img_width, x1 + crop_width) - 1
+
+ # crop the image
+ img = mmcv.imcrop(img, bboxes=np.array([x1, y1, x2, y2]))
+
+ if self.efficientnet_style:
+ img = mmcv.imresize(img,
+ tuple(self.crop_size[::-1]),
+ interpolation=self.interpolation,
+ backend=self.backend)
+ img_shape = img.shape
+ results[key] = img
+ results['img_shape'] = img_shape
+
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__ + f'(crop_size={self.crop_size}'
+ repr_str += f', efficientnet_style={self.efficientnet_style}'
+ repr_str += f', crop_padding={self.crop_padding}'
+ repr_str += f', interpolation={self.interpolation}'
+ repr_str += f', backend={self.backend})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class Normalize(object):
+ """Normalize the image.
+
+ Args:
+ mean (sequence): Mean values of 3 channels.
+ std (sequence): Std values of 3 channels.
+ to_rgb (bool): Whether to convert the image from BGR to RGB,
+ default is true.
+ """
+ def __init__(self, mean, std, to_rgb=True):
+ self.mean = np.array(mean, dtype=np.float32)
+ self.std = np.array(std, dtype=np.float32)
+ self.to_rgb = to_rgb
+
+ def __call__(self, results):
+ for key in results.get('img_fields', ['img']):
+ results[key] = mmcv.imnormalize(results[key], self.mean, self.std,
+ self.to_rgb)
+ results['img_norm_cfg'] = dict(mean=self.mean,
+ std=self.std,
+ to_rgb=self.to_rgb)
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(mean={list(self.mean)}, '
+ repr_str += f'std={list(self.std)}, '
+ repr_str += f'to_rgb={self.to_rgb})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class ColorJitter(object):
+ """Randomly change the brightness, contrast and saturation of an image.
+
+ Args:
+ brightness (float): How much to jitter brightness.
+ brightness_factor is chosen uniformly from
+ [max(0, 1 - brightness), 1 + brightness].
+ contrast (float): How much to jitter contrast.
+ contrast_factor is chosen uniformly from
+ [max(0, 1 - contrast), 1 + contrast].
+ saturation (float): How much to jitter saturation.
+ saturation_factor is chosen uniformly from
+ [max(0, 1 - saturation), 1 + saturation].
+ """
+ def __init__(self, brightness, contrast, saturation):
+ self.brightness = brightness
+ self.contrast = contrast
+ self.saturation = saturation
+
+ def __call__(self, results):
+ brightness_factor = random.uniform(0, self.brightness)
+ contrast_factor = random.uniform(0, self.contrast)
+ saturation_factor = random.uniform(0, self.saturation)
+ color_jitter_transforms = [
+ dict(type='Brightness',
+ magnitude=brightness_factor,
+ prob=1.,
+ random_negative_prob=0.5),
+ dict(type='Contrast',
+ magnitude=contrast_factor,
+ prob=1.,
+ random_negative_prob=0.5),
+ dict(type='ColorTransform',
+ magnitude=saturation_factor,
+ prob=1.,
+ random_negative_prob=0.5)
+ ]
+ random.shuffle(color_jitter_transforms)
+ transform = Compose(color_jitter_transforms)
+ return transform(results)
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(brightness={self.brightness}, '
+ repr_str += f'contrast={self.contrast}, '
+ repr_str += f'saturation={self.saturation})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class Lighting(object):
+ """Adjust images lighting using AlexNet-style PCA jitter.
+
+ Args:
+        eigval (list): the eigenvalues of the covariance matrix of pixel
+            values.
+        eigvec (list[list]): the eigenvectors of the covariance matrix of
+            pixel values.
+ alphastd (float): The standard deviation for distribution of alpha.
+ Defaults to 0.1
+ to_rgb (bool): Whether to convert img to rgb.
+ """
+ def __init__(self, eigval, eigvec, alphastd=0.1, to_rgb=True):
+ assert isinstance(eigval, list), \
+ f'eigval must be of type list, got {type(eigval)} instead.'
+ assert isinstance(eigvec, list), \
+ f'eigvec must be of type list, got {type(eigvec)} instead.'
+ for vec in eigvec:
+ assert isinstance(vec, list) and len(vec) == len(eigvec[0]), \
+ 'eigvec must contains lists with equal length.'
+ self.eigval = np.array(eigval)
+ self.eigvec = np.array(eigvec)
+ self.alphastd = alphastd
+ self.to_rgb = to_rgb
+
+ def __call__(self, results):
+ for key in results.get('img_fields', ['img']):
+ img = results[key]
+ results[key] = mmcv.adjust_lighting(img,
+ self.eigval,
+ self.eigvec,
+ alphastd=self.alphastd,
+ to_rgb=self.to_rgb)
+ return results
+
+ def __repr__(self):
+ repr_str = self.__class__.__name__
+ repr_str += f'(eigval={self.eigval.tolist()}, '
+ repr_str += f'eigvec={self.eigvec.tolist()}, '
+ repr_str += f'alphastd={self.alphastd}, '
+ repr_str += f'to_rgb={self.to_rgb})'
+ return repr_str
+
+
+@PIPELINES.register_module()
+class RandomChannelNoise:
+ """Data augmentation with random channel noise.
+
+ Required keys: 'img'
+ Modifies key: 'img'
+ Args:
+        noise_factor (float): Multiply each channel with a factor sampled
+            uniformly from ``[1 - noise_factor, 1 + noise_factor]``.
+ """
+ def __init__(self, noise_factor=0.4):
+ self.noise_factor = noise_factor
+
+ def __call__(self, results):
+ """Perform data augmentation with random channel noise."""
+ img = results['img']
+
+ # Each channel is multiplied with a number
+ # in the area [1-self.noise_factor, 1+self.noise_factor]
+ pn = np.random.uniform(1 - self.noise_factor, 1 + self.noise_factor,
+ (1, 3))
+ img = cv2.multiply(img, pn)
+
+ results['img'] = img
+
+ if 'ori_img' in results:
+ img = results['ori_img']
+ img = cv2.multiply(img, pn)
+
+ results['ori_img'] = img
+
+ return results
+
+
+@PIPELINES.register_module()
+class GetRandomScaleRotation:
+ """Data augmentation with random scaling & rotating.
+
+ Required key: 'scale'. Modifies key: 'scale' and 'rotation'.
+ Args:
+ rot_factor (int): Rotating to ``[-2*rot_factor, 2*rot_factor]``.
+ scale_factor (float): Scaling to ``[1-scale_factor, 1+scale_factor]``.
+ rot_prob (float): Probability of random rotation.
+ """
+ def __init__(self, rot_factor=30, scale_factor=0.25, rot_prob=0.6):
+ self.rot_factor = rot_factor
+ self.scale_factor = scale_factor
+ self.rot_prob = rot_prob
+
+ def __call__(self, results):
+ """Perform data augmentation with random scaling & rotating."""
+ s = results['scale']
+
+ sf = self.scale_factor
+ rf = self.rot_factor
+
+ s_factor = np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf)
+ s = s * s_factor
+
+ r_factor = np.clip(np.random.randn() * rf, -rf * 2, rf * 2)
+ r = r_factor if np.random.rand() <= self.rot_prob else 0.0
+
+ results['scale'] = s
+ results['rotation'] = r
+
+ return results
+
+
+@PIPELINES.register_module()
+class SampleInstance:
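+    """Randomly sample a subset of person instances and crop to their union.
+
+    With probability ``sample_ratio`` a random subset of the bounding boxes
+    is selected; otherwise all instances are kept. The union of the selected
+    boxes defines the new square crop ('center' and 'scale').
+
+    Required key: 'bbox_xywh'. Added/modified keys: 'bbox_xyxy', 'center',
+    'scale'.
+    """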
+ def __init__(self, sample_ratio):
+ self.sample_ratio = sample_ratio
+
+ def __call__(self, results):
+ assert 'bbox_xywh' in results
+ bbox_xywh = results['bbox_xywh'].copy()
+ crop_person_number = len(bbox_xywh)
+ if random.random() < self.sample_ratio:
+ crop_person_number = np.random.randint(len(bbox_xywh)) + 1
+
+ sample_ids = np.array(
+ random.sample(list(range(len(bbox_xywh))), crop_person_number))
+
+ bbox_xyxy = xywh2xyxy(bbox_xywh)[sample_ids]
+
+ leftTop_ = bbox_xyxy[:, :2]
+ leftTop_ = np.array([np.min(leftTop_[:, 0]), np.min(leftTop_[:, 1])])
+ rightBottom_ = bbox_xyxy[:, 2:4]
+ rightBottom_ = np.array(
+ [np.max(rightBottom_[:, 0]),
+ np.max(rightBottom_[:, 1])])
+ bbox_xyxy = np.concatenate([leftTop_, rightBottom_])
+ results['bbox_xyxy'] = bbox_xyxy
+ center = (rightBottom_ + leftTop_) / 2
+ scale = (rightBottom_ - leftTop_)
+ scale[0] = scale[1] = max(scale)
+ results['center'] = center
+ results['scale'] = scale
+ return results
+
+
+@PIPELINES.register_module()
+class MeshAffine:
+ """Affine transform the image to get input image.
+
+    Affine transform the 2D keypoints and 3D keypoints. Required keys: 'img',
+    'pose', 'img_shape', 'rotation' and 'center'. Modifies keys: 'img',
+    'keypoints2d', 'keypoints3d', 'pose'.
+ """
+ def __init__(self, img_res, crop_with_bbox=True):
+ if isinstance(img_res, tuple):
+ self.image_size = img_res
+ else:
+ self.image_size = np.array([img_res, img_res])
+ self.img_res = img_res
+ self.crop_with_bbox = crop_with_bbox
+
+ def __call__(self, results):
+
+ c = results['center']
+ s = results['scale']
+ r = results['rotation']
+
+ trans = get_affine_transform(c, s, r, self.image_size)
+
+ if 'img' in results:
+ img = results['img'].copy()
+
+ # img before affine
+ ori_img = img.copy()
+ results['crop_transform'] = trans
+ results['ori_img'] = ori_img
+ results['img_fields'] = ['img', 'ori_img']
+
+ img = cv2.warpAffine(
+ img,
+ trans, (int(self.image_size[0]), int(self.image_size[1])),
+ flags=cv2.INTER_LINEAR)
+ results['img'] = img
+
+ if 'keypoints2d' in results:
+ keypoints2d = results['keypoints2d'].copy()
+
+ results['keypoints2d'][..., :2] = affine_transform(
+ keypoints2d, trans)
+ if 'bbox_xywh' in results:
+ bbox_xywh = results['bbox_xywh'].copy()
+
+ leftTop = bbox_xywh[..., :2]
+ rightTop = np.concatenate([
+ bbox_xywh[..., [0]] + bbox_xywh[..., [2]], bbox_xywh[..., [1]]
+ ], -1)
+ leftBottom = np.concatenate([
+ bbox_xywh[..., [0]], bbox_xywh[..., [1]] + bbox_xywh[..., [3]]
+ ], -1)
+ rightBottom = np.concatenate([
+ bbox_xywh[..., [0]] + bbox_xywh[..., [2]],
+ bbox_xywh[..., [1]] + bbox_xywh[..., [3]]
+ ], -1)
+
+ bbox_point = np.vstack(
+ [leftTop, rightTop, leftBottom, rightBottom])
+ bbox_point = np.concatenate(
+ [bbox_point, np.ones_like(bbox_point[..., [0]])], -1)
+ bbox_point = affine_transform(bbox_point, trans)
+            bbox_point = np.clip(bbox_point, 0, self.img_res)
+            results['bbox'] = bbox_point
+
+ # bbox_xyxy = xywh2xyxy(bbox_xywh)[:,:4].reshape(-1, 2, 2)
+ # bbox_xyxy = np.concatenate([bbox_xyxy, np.ones_like(bbox_xyxy[...,[0]])], -1)
+ # bbox_xyxy = np.concatenate([affine_transform(bbox_xyxy, trans).reshape(-1,4), bbox_xywh[...,[-1]]],-1)
+ # results['bbox_xywh'] = xyxy2xywh(bbox_xyxy)
+
+ if 'keypoints3d' in results:
+ keypoints3d = results['keypoints3d'].copy()
+ keypoints3d[..., :3] = _rotate_joints_3d(keypoints3d[..., :3], r)
+ results['keypoints3d'] = keypoints3d
+
+ if 'smpl_body_pose' in results:
+ global_orient = results['smpl_global_orient'].copy()
+ body_pose = results['smpl_body_pose'].copy().reshape((-1, 23 * 3))
+ pose = np.concatenate((global_orient, body_pose), axis=-1)
+ pose = _rotate_smpl_pose(pose, r)
+ results['smpl_global_orient'] = pose[..., :3]
+ results['smpl_body_pose'] = pose[..., 3:].reshape((-1, 23, 3))
+
+ if 'smplx_global_orient' in results:
+ global_orient = results['smplx_global_orient'].copy()
+ global_orient = _rotate_smpl_pose(global_orient, r)
+ results['smplx_global_orient'] = global_orient
+
+ return results
+
+
+@PIPELINES.register_module()
+class MeshAffineED:
+ """Affine transform the image to get input image.
+
+    Affine transform the 2D keypoints and 3D keypoints. Required keys: 'img',
+    'pose', 'img_shape', 'rotation' and 'center'. Modifies keys: 'img',
+    'keypoints2d', 'keypoints3d', 'pose'.
+ """
+ def __init__(self, sizes, max_size=None):
+ assert isinstance(sizes, (list, tuple))
+ self.sizes = sizes
+ self.max_size = max_size
+
+ def __call__(self, results):
+ ori_shape = np.array(results['ori_shape'])
+ # ori_shape = ori_shape[::-1]
+ size = random.choice(self.sizes)
+ reshape_size = resize(ori_shape, size, self.max_size)
+ c = (ori_shape / 2)[::-1]
+ s = ori_shape[::-1]
+ r = results['rotation']
+
+ trans = get_affine_transform(c, s, r, reshape_size[::-1])
+
+ results['img_shape'] = reshape_size
+ if 'img' in results:
+ img = results['img'].copy()
+
+ # img before affine
+ ori_img = img.copy()
+ results['crop_transform'] = trans
+ results['ori_img'] = ori_img
+ results['img_fields'] = ['img', 'ori_img']
+
+ img = cv2.warpAffine(img,
+ trans,
+ (int(reshape_size[1]), int(reshape_size[0])),
+ flags=cv2.INTER_LINEAR)
+ results['img'] = img
+
+ if 'keypoints2d_ori' in results:
+ keypoints2d_ori = results['keypoints2d_ori'].copy()
+
+ results['keypoints2d_ori'][..., :2] = affine_transform(
+ keypoints2d_ori, trans)
+
+ if 'keypoints2d_smpl' in results:
+ keypoints2d_smpl = results['keypoints2d_smpl'].copy()
+
+ results['keypoints2d_smpl'][..., :2] = affine_transform(
+ keypoints2d_smpl, trans)
+
+ if 'bbox_xywh' in results:
+ bbox_xywh = results['bbox_xywh'].copy()
+
+ leftTop = bbox_xywh[..., :2]
+ rightTop = np.concatenate([
+ bbox_xywh[..., [0]] + bbox_xywh[..., [2]], bbox_xywh[..., [1]]
+ ], -1)
+ leftBottom = np.concatenate([
+ bbox_xywh[..., [0]], bbox_xywh[..., [1]] + bbox_xywh[..., [3]]
+ ], -1)
+ rightBottom = np.concatenate([
+ bbox_xywh[..., [0]] + bbox_xywh[..., [2]],
+ bbox_xywh[..., [1]] + bbox_xywh[..., [3]]
+ ], -1)
+
+ bbox_point = np.vstack(
+ [leftTop, rightTop, leftBottom, rightBottom])
+ bbox_point = np.concatenate(
+ [bbox_point, np.ones_like(bbox_point[..., [0]])], -1)
+ bbox_point = affine_transform(bbox_point, trans)
+ # TODO:
+
+ bbox_point = np.clip(bbox_point, 0,
+ (int(reshape_size[1]), int(reshape_size[0])))
+ results['bbox'] = bbox_point
+
+ bbox_xyxy_t = bbox_xywh.copy()
+ num_sample = bbox_xywh.shape[0]
+ bbox_xyxy_t[..., :2] = bbox_point[:num_sample, :]
+ bbox_xyxy_t[...,
+ 2:4] = bbox_point[num_sample * 3:num_sample * 4, :]
+
+ results['bbox_xywh'] = xyxy2xywh(bbox_xyxy_t)
+ # bbox_xywh = results['bbox_xywh'].copy()
+ # bbox_xyxy = xywh2xyxy(bbox_xywh)[:,:4].reshape(-1, 2, 2)
+ # bbox_xyxy = np.concatenate([bbox_xyxy, np.ones_like(bbox_xyxy[...,[0]])], -1)
+ # bbox_xyxy = np.concatenate([affine_transform(bbox_xyxy, trans).reshape(-1,4), bbox_xywh[...,[-1]]],-1)
+ # results['bbox_xywh'] = xyxy2xywh(bbox_xyxy)
+
+ if 'keypoints3d_ori' in results:
+ keypoints3d_ori = results['keypoints3d_ori'].copy()
+ keypoints3d_ori[..., :3] = _rotate_joints_3d(
+ keypoints3d_ori[..., :3], r)
+ results['keypoints3d_ori'] = keypoints3d_ori
+
+ if 'keypoints3d_smpl' in results:
+ keypoints3d_smpl = results['keypoints3d_smpl'].copy()
+ keypoints3d_smpl[..., :3] = _rotate_joints_3d(
+ keypoints3d_smpl[..., :3], r)
+ results['keypoints3d_smpl'] = keypoints3d_smpl
+
+ if 'smpl_body_pose' in results:
+ global_orient = results['smpl_global_orient'].copy()
+ body_pose = results['smpl_body_pose'].copy().reshape((-1, 23 * 3))
+ pose = np.concatenate((global_orient, body_pose), axis=-1)
+ pose = _rotate_smpl_pose(pose, r)
+ results['smpl_global_orient'] = pose[..., :3]
+ results['smpl_body_pose'] = pose[..., 3:].reshape((-1, 23, 3))
+
+ if 'area' in results:
+ area = results['area'] * (trans[0, 0] * trans[1, 1])
+ results['area'] = area
+ # if 'smplx_global_orient' in results:
+ # global_orient = results['smplx_global_orient'].copy()
+ # global_orient = _rotate_smpl_pose(global_orient, r)
+ # results['smplx_global_orient'] = global_orient
+
+ return results
+
+
+@PIPELINES.register_module()
+class Rotation:
+ """Rotate the image with the given rotation.
+
+    Rotate the 2D keypoints, 3D keypoints and poses. Required keys: 'img',
+    'pose', 'rotation' and 'center'. Modifies keys: 'img',
+    'keypoints2d', 'keypoints3d', 'pose'.
+
+    To avoid conflicts with MeshAffine, 'rotation' is reset to 0.0 after
+    the image has been rotated.
+    The original rotation value is stored under 'ori_rotation'.
+ """
+ def __init__(self):
+ pass
+
+ def __call__(self, results):
+ r = results['rotation']
+ if r == 0.0:
+ return results
+ img = results['img']
+
+ # img before affine
+ (h, w) = img.shape[:2]
+ (cX, cY) = (w // 2, h // 2)
+ M = cv2.getRotationMatrix2D((cX, cY), r, 1.0)
+ cos = np.abs(M[0, 0])
+ sin = np.abs(M[0, 1])
+ # compute the new bounding dimensions of the image
+ nW = int((h * sin) + (w * cos))
+ nH = int((h * cos) + (w * sin))
+ # adjust the rotation matrix to take into account translation
+ M[0, 2] += (nW / 2) - cX
+ M[1, 2] += (nH / 2) - cY
+ # perform the actual rotation and return the image
+ img = cv2.warpAffine(img, M, (nW, nH))
+
+ results['img'] = img
+
+ c = results['center']
+ c = np.dot(M[:2, :2], c) + M[:2, 2]
+ results['center'] = c
+
+ if 'keypoints2d' in results:
+ keypoints2d = results['keypoints2d'].copy()
+ keypoints2d[:, :2] = (np.dot(keypoints2d[:, :2], M[:2, :2].T) +
+                                  M[:2, 2] + 1).astype(int)
+ results['keypoints2d'] = keypoints2d
+
+ if 'keypoints3d' in results:
+ keypoints3d = results['keypoints3d'].copy()
+ keypoints3d[:, :3] = _rotate_joints_3d(keypoints3d[:, :3], r)
+ results['keypoints3d'] = keypoints3d
+
+ if 'smpl_body_pose' in results:
+ global_orient = results['smpl_global_orient'].copy()
+ body_pose = results['smpl_body_pose'].copy().reshape((-1))
+ pose = np.concatenate((global_orient, body_pose), axis=-1)
+ pose = _rotate_smpl_pose(pose, r)
+ results['smpl_global_orient'] = pose[:3]
+ results['smpl_body_pose'] = pose[3:].reshape((-1, 3))
+
+ if 'smplx_global_orient' in results:
+ global_orient = results['smplx_global_orient'].copy()
+ global_orient = _rotate_smpl_pose(global_orient, r)
+ results['smplx_global_orient'] = global_orient
+
+ results['rotation'] = 0.0
+ results['ori_rotation'] = r
+ return results
+
+
+@PIPELINES.register_module()
+class BBoxCenterJitter(object):
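+    """Randomly jitter the bbox center by a fraction of the bbox size.
+
+    Args:
+        factor (float): jitter magnitude relative to the bbox size
+            (``results['scale'][0]``). Values <= 1e-3 disable jittering.
+        dist (str): sampling distribution, either 'normal' or 'uniform'.
+    """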
+ def __init__(self, factor=0.0, dist='normal'):
+ super(BBoxCenterJitter, self).__init__()
+ self.factor = factor
+ self.dist = dist
+ assert self.dist in [
+ 'normal', 'uniform'
+ ], (f'Distribution must be normal or uniform, not {self.dist}')
+
+ def __call__(self, results):
+ # body model: no process
+ if self.factor <= 1e-3:
+ return results
+
+ bbox_size = results['scale'][0]
+
+ jitter = bbox_size * self.factor
+
+ if self.dist == 'normal':
+ center_jitter = np.random.randn(2) * jitter
+ elif self.dist == 'uniform':
+ center_jitter = np.random.rand(2) * 2 * jitter - jitter
+
+ center = results['center']
+ H, W = results['img_shape']
+ new_center = center + center_jitter
+ new_center[0] = np.clip(new_center[0], 0, W)
+ new_center[1] = np.clip(new_center[1], 0, H)
+
+ results['center'] = new_center
+ return results
+
+
+@PIPELINES.register_module()
+class SimulateLowRes(object):
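+    """Simulate a low-resolution input by downsampling and re-upsampling.
+
+    The downsampling factor is drawn uniformly from
+    ``[factor_min, factor_max]`` when ``dist='uniform'``, or sampled from
+    ``cat_factors`` when ``dist='categorical'``.
+    """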
+ def __init__(self,
+ dist: str = 'categorical',
+ factor: float = 1.0,
+ cat_factors=(1.0, ),
+ factor_min: float = 1.0,
+ factor_max: float = 1.0) -> None:
+ self.factor_min = factor_min
+ self.factor_max = factor_max
+ self.dist = dist
+ self.cat_factors = cat_factors
+ assert dist in ['uniform', 'categorical']
+
+ def _sample_low_res(self, image: np.ndarray) -> np.ndarray:
+ """"""
+ if self.dist == 'uniform':
+ downsample = self.factor_min != self.factor_max
+ if not downsample:
+ return image
+ factor = np.random.rand() * (self.factor_max -
+ self.factor_min) + self.factor_min
+ elif self.dist == 'categorical':
+ if len(self.cat_factors) < 2:
+ return image
+ idx = np.random.randint(0, len(self.cat_factors))
+ factor = self.cat_factors[idx]
+
+ H, W, _ = image.shape
+        downsampled_image = cv2.resize(image,
+                                       (int(W // factor), int(H // factor)),
+                                       interpolation=cv2.INTER_NEAREST)
+        resized_image = cv2.resize(downsampled_image, (W, H),
+                                   interpolation=cv2.INTER_LINEAR_EXACT)
+ return resized_image
+
+ def __call__(self, results):
+ """"""
+ img = results['img']
+ img = self._sample_low_res(img)
+ results['img'] = img
+
+ return results
diff --git a/detrsmpl/data/datasets/samplers/__init__.py b/detrsmpl/data/datasets/samplers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cffe4dcb0fa663750bd88941e06ad336a43b527f
--- /dev/null
+++ b/detrsmpl/data/datasets/samplers/__init__.py
@@ -0,0 +1,3 @@
+from .distributed_sampler import DistributedSampler
+
+__all__ = ['DistributedSampler']
diff --git a/detrsmpl/data/datasets/samplers/distributed_sampler.py b/detrsmpl/data/datasets/samplers/distributed_sampler.py
new file mode 100644
index 0000000000000000000000000000000000000000..2388e072cba588db134c224dd04e20ee20c9bbbd
--- /dev/null
+++ b/detrsmpl/data/datasets/samplers/distributed_sampler.py
@@ -0,0 +1,41 @@
+import torch
+from torch.utils.data import DistributedSampler as _DistributedSampler
+
+
+class DistributedSampler(_DistributedSampler):
+ def __init__(self,
+ dataset,
+ num_replicas=None,
+ rank=None,
+ shuffle=True,
+ round_up=True):
+ super().__init__(dataset, num_replicas=num_replicas, rank=rank)
+ self.shuffle = shuffle
+ self.round_up = round_up
+ if self.round_up:
+ self.total_size = self.num_samples * self.num_replicas
+ else:
+ self.total_size = len(self.dataset)
+
+ def __iter__(self):
+ # deterministically shuffle based on epoch
+ if self.shuffle:
+ g = torch.Generator()
+ g.manual_seed(self.epoch)
+ indices = torch.randperm(len(self.dataset), generator=g).tolist()
+ else:
+ indices = torch.arange(len(self.dataset)).tolist()
+
+ # add extra samples to make it evenly divisible
+ if self.round_up:
+ indices = (
+ indices *
+ int(self.total_size / len(indices) + 1))[:self.total_size]
+ assert len(indices) == self.total_size
+
+ # subsample
+ indices = indices[self.rank:self.total_size:self.num_replicas]
+ if self.round_up:
+ assert len(indices) == self.num_samples
+
+ return iter(indices)
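With `round_up=True` the shuffled index list is tiled up to `total_size`, so every rank draws the same number of samples and per-GPU batch counts stay equal. A hedged usage sketch (explicit `num_replicas`/`rank`, so no process group needs to be initialized; assumes the repo root is importable):

```python
import torch
from torch.utils.data import TensorDataset

from detrsmpl.data.datasets.samplers import DistributedSampler

dataset = TensorDataset(torch.arange(10))  # 10 samples, 4 replicas
for rank in range(4):
    sampler = DistributedSampler(dataset, num_replicas=4, rank=rank,
                                 shuffle=False, round_up=True)
    # Each rank yields ceil(10 / 4) = 3 indices; the tail is padded by
    # repeating the index list, so 4 * 3 = 12 indices are drawn in total.
    print(rank, list(sampler))
```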
diff --git a/detrsmpl/models/__init__.py b/detrsmpl/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/models/architectures/DetrSMPL.py b/detrsmpl/models/architectures/DetrSMPL.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f5d8eaa960578f07f0e5b1a548dc34dfe0a0ec0
--- /dev/null
+++ b/detrsmpl/models/architectures/DetrSMPL.py
@@ -0,0 +1,771 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta
+from typing import Optional, Union
+
+import torch
+from scipy.optimize import linear_sum_assignment
+import numpy as np
+from detrsmpl.core.post_processing.bbox.assigners import build_assigner
+from detrsmpl.core.post_processing.bbox.samplers import build_sampler
+from detrsmpl.core.conventions.keypoints_mapping import (get_keypoint_idx,
+ convert_kps)
+from detrsmpl.utils.geometry import batch_rodrigues
+from detrsmpl.utils.geometry import project_points
+from detrsmpl.utils.misc import multi_apply
+from ..backbones.builder import build_backbone
+from ..body_models.builder import build_body_model
+from ..heads.builder import build_head
+from ..losses.builder import build_loss
+from ..necks.builder import build_neck
+from .base_architecture import BaseArchitecture
+
+# from mmdet.core import bbox2result
+
+
+class MultiBodyEstimator(BaseArchitecture, metaclass=ABCMeta):
+ def __init__(
+ self,
+ backbone: Optional[Union[dict, None]] = None,
+ neck: Optional[Union[dict, None]] = None,
+ head: Optional[Union[dict, None]] = None,
+ disc: Optional[Union[dict, None]] = None,
+ registration: Optional[Union[dict, None]] = None,
+ body_model_train: Optional[Union[dict, None]] = None,
+ body_model_test: Optional[Union[dict, None]] = None,
+ convention: Optional[str] = 'human_data',
+ loss_keypoints2d: Optional[Union[dict, None]] = None,
+ loss_keypoints3d: Optional[Union[dict, None]] = None,
+ loss_vertex: Optional[Union[dict, None]] = None,
+ loss_smpl_pose: Optional[Union[dict, None]] = None,
+ loss_smpl_betas: Optional[Union[dict, None]] = None,
+ loss_camera: Optional[Union[dict, None]] = None,
+ loss_cls: Optional[Union[dict,
+ None]] = dict(type='CrossEntropyLoss',
+ bg_cls_weight=0.1,
+ use_sigmoid=False,
+ loss_weight=1.0,
+ class_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ init_cfg: Optional[Union[list, dict, None]] = None,
+ train_cfg:
+ Optional[Union[dict, None]] = dict(assigner=dict(
+ type='HungarianAssigner',
+ kp3d_cost=dict(
+ type='Keypoints3DCost', convention='smpl_54', weight=5.0),
+ kp2d_cost=dict(
+ type='Keypoints2DCost', convention='smpl_54', weight=5.0),
+ # cls_cost=dict(type='ClassificationCost', weight=1.),
+ # reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+ # iou_cost=dict(
+ # type='IoUCost', iou_mode='giou', weight=2.0))
+ )),
+ test_cfg: Optional[Union[dict, None]] = None):
+
+ super(MultiBodyEstimator, self).__init__(init_cfg)
+ self.backbone = build_backbone(backbone)
+        self.neck = build_neck(neck) if neck is not None else None
+ head.update(train_cfg=train_cfg)
+ head.update(test_cfg=test_cfg)
+ self.head = build_head(head)
+ # class_weight = loss_cls.get('class_weight', None)
+ if train_cfg:
+ assert 'assigner' in train_cfg, 'assigner should be provided '\
+ 'when train_cfg is set.'
+ assigner = train_cfg['assigner']
+ # TODO: update these
+ # assert loss_cls['loss_weight'] == assigner['kp3d_cost']['weight'], \
+ # 'The classification weight for loss and matcher should be' \
+ # 'exactly the same.'
+ # assert loss_bbox['loss_weight'] == assigner['kp3d_cost'][
+ # 'weight'], 'The regression L1 weight for loss and matcher ' \
+ # 'should be exactly the same.'
+ # assert loss_iou['loss_weight'] == assigner['kp3d_cost']['weight'], \
+ # 'The regression iou weight for loss and matcher should be' \
+ # 'exactly the same.'
+ self.assigner = build_assigner(assigner)
+ # DETR sampling=False, so use PseudoSampler
+ sampler_cfg = dict(type='PseudoSampler')
+ self.sampler = build_sampler(sampler_cfg, context=self)
+
+ self.train_cfg = train_cfg
+ self.test_cfg = test_cfg
+
+ # build loss
+ self.loss_keypoints2d = build_loss(loss_keypoints2d)
+ self.loss_keypoints3d = build_loss(loss_keypoints3d)
+ self.loss_vertex = build_loss(loss_vertex)
+ self.loss_smpl_pose = build_loss(loss_smpl_pose)
+ self.loss_smpl_betas = build_loss(loss_smpl_betas)
+ self.loss_cls = build_loss(loss_cls)
+ self.loss_bbox = build_loss(loss_bbox)
+ self.loss_iou = build_loss(loss_iou)
+
+ self.body_model_train = build_body_model(body_model_train)
+ self.body_model_test = build_body_model(body_model_test)
+ self.convention = convention
+
+ def extract_feat(self, img):
+ """Directly extract features from the backbone+neck."""
+ x = self.backbone(img)
+        if self.neck is not None:
+ x = self.neck(x)
+ return x
+
+ def forward_dummy(self, img):
+ """Used for computing network flops.
+
+ See `mmdetection/tools/analysis_tools/get_flops.py`
+ """
+ x = self.extract_feat(img)
+ outs = self.bbox_head(x)
+ return outs
+
+ def forward_train(self, img, img_metas, **kwargs):
+ """
+ Args:
+ img (Tensor): Input images of shape (N, C, H, W).
+ Typically these should be mean centered and std scaled.
+ img_metas (list[dict]): A List of image info dict where each dict
+ has: 'img_shape', 'scale_factor', 'flip', and may also contain
+ 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+ For details on the values of these keys see
+ :class:`mmdet.datasets.pipelines.Collect`.
+ gt_bboxes (list[Tensor]): Each item are the truth boxes for each
+ image in [tl_x, tl_y, br_x, br_y] format.
+ gt_labels (list[Tensor]): Class indices corresponding to each box
+ gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
+ boxes can be ignored when computing the loss.
+
+ Returns:
+ dict[str, Tensor]: A dictionary of loss components.
+ """
+ # super(SingleStageDetector, self).forward_train(img, img_metas)
+ # NOTE the batched image size information may be useful, e.g.
+ # in DETR, this is needed for the construction of masks, which is
+ # then used for the transformer_head.
+
+ has_smpl = kwargs['has_smpl']
+ gt_smpl_body_pose = kwargs[
+ 'smpl_body_pose'] # [bs_0: [ins_num, 23, 3]]
+ gt_smpl_global_orient = kwargs['smpl_global_orient']
+ gt_smpl_body_pose = \
+ [torch.cat((gt_smpl_global_orient[i].view(-1, 1, 3),
+ gt_smpl_body_pose[i]), dim=1).float()
+ for i in range(len(gt_smpl_body_pose))]
+ gt_smpl_betas = kwargs['smpl_betas']
+ gt_smpl_transl = kwargs['smpl_transl']
+ gt_keypoints2d = kwargs['keypoints2d']
+        gt_keypoints3d = kwargs['keypoints3d']  # [bs_0: [N, K, D], ...]
+
+ if 'has_keypoints3d' in kwargs:
+ has_keypoints3d = kwargs['has_keypoints3d']
+ else:
+ has_keypoints3d = None
+
+ if 'has_keypoints2d' in kwargs:
+ has_keypoints2d = kwargs['has_keypoints2d']
+ else:
+ has_keypoints2d = None
+
+ batch_input_shape = tuple(img[0].size()[-2:])
+ for img_meta in img_metas:
+ img_meta['batch_input_shape'] = batch_input_shape
+
+ # features = self.extract_feat(img)
+ features = self.backbone(img)
+
+ if self.neck is not None:
+ features = self.neck(features)
+
+ # outputs_classes, outputs_coords,
+ pred_pose, \
+ pred_betas, pred_cameras, _, _ = self.head(features, img_metas)
+
+ L, B, N = pred_pose.shape[:3]
+ if self.body_model_train is not None:
+ pred_output = self.body_model_train(
+ betas=pred_betas.reshape(L * B * N, 10),
+ body_pose=pred_pose.reshape(L * B * N, 24, 3, 3)[:, 1:],
+ global_orient=pred_pose.reshape(L * B * N, 24, 3,
+ 3)[:, 0].unsqueeze(1),
+ pose2rot=False,
+ num_joints=gt_keypoints2d[0].shape[1])
+ pred_keypoints3d = pred_output['joints'].reshape(L, B, N, -1, 3)
+ pred_vertices = pred_output['vertices'].reshape(L, B, N, 6890, 3)
+ # loss
+ num_dec_layers = pred_pose.shape[0]
+
+ all_gt_smpl_body_pose_list = [
+ gt_smpl_body_pose for _ in range(num_dec_layers)
+ ]
+ all_gt_smpl_global_orient_list = [
+ gt_smpl_global_orient for _ in range(num_dec_layers)
+ ]
+ all_gt_smpl_betas_list = [gt_smpl_betas for _ in range(num_dec_layers)]
+ all_gt_smpl_transl_list = [
+ gt_smpl_transl for _ in range(num_dec_layers)
+ ]
+ all_gt_keypoints2d_list = [
+ gt_keypoints2d for _ in range(num_dec_layers)
+ ]
+ all_gt_keypoints3d_list = [
+ gt_keypoints3d for _ in range(num_dec_layers)
+ ]
+ all_has_smpl_list = [has_smpl for _ in range(num_dec_layers)]
+ all_has_keypoints3d_list = [
+ has_keypoints3d for _ in range(num_dec_layers)
+ ]
+ all_has_keypoints2d_list = [
+ has_keypoints2d for _ in range(num_dec_layers)
+ ]
+ all_gt_ignore_list = [None for _ in range(num_dec_layers)]
+ img_metas_list = [img_metas for _ in range(num_dec_layers)]
+ # all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
+ # all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
+ # all_gt_bboxes_ignore_list = [
+ # gt_bboxes_ignore for _ in range(num_dec_layers)
+ # ]
+        # compute the loss for each decoder layer
+ (kp2d_loss, kp3d_loss, vert_loss, pose_loss, beta_loss) = multi_apply(
+ self.compute_losses, pred_pose, pred_betas, pred_keypoints3d,
+ pred_vertices, pred_cameras, all_gt_smpl_body_pose_list,
+ all_gt_smpl_betas_list, all_gt_keypoints2d_list,
+ all_gt_keypoints3d_list, all_has_keypoints2d_list,
+ all_has_keypoints3d_list, all_has_smpl_list, img_metas_list,
+ all_gt_ignore_list)
+
+ losses = {}
+ losses['keypoints2d_loss'] = kp2d_loss[-1]
+ losses['keypoints3d_loss'] = kp3d_loss[-1]
+ losses['vertex_loss'] = vert_loss[-1]
+ losses['smpl_pose_loss'] = pose_loss[-1]
+ losses['smpl_betas_loss'] = beta_loss[-1]
+
+ # loss from other decoder layers
+ num_dec_layer = 0
+ for (kp2d_loss_i, kp3d_loss_i, vert_loss_i, pose_loss_i,
+ beta_loss_i) in zip(kp2d_loss[:-1], kp3d_loss[:-1],
+ vert_loss[:-1], pose_loss[:-1],
+ beta_loss[:-1]):
+ losses[f'd{num_dec_layer}.keypoints2d_loss'] = kp2d_loss_i
+ losses[f'd{num_dec_layer}.keypoints3d_loss'] = kp3d_loss_i
+ losses[f'd{num_dec_layer}.vertex_loss'] = vert_loss_i
+ losses[f'd{num_dec_layer}.smpl_pose_loss'] = pose_loss_i
+ losses[f'd{num_dec_layer}.smpl_betas_loss'] = beta_loss_i
+ num_dec_layer += 1
+
+ return losses
+
+ def compute_losses(self,
+ outputs_poses,
+ outputs_shapes,
+ outputs_kp3ds,
+ outputs_verts,
+ outputs_cameras,
+ all_gt_smpl_body_pose_list,
+ all_gt_smpl_betas_list,
+ all_gt_kp2d_list,
+ all_gt_kp3d_list,
+ all_has_keypoints2d_list,
+ all_has_keypoints3d_list,
+ all_has_smpl_list,
+ img_metas_list,
+ all_gt_ignore_list=None):
+ """_summary_
+ loss_single
+ get_targets
+ Args:
+ outputs_poses (_type_): with shape [B, N, 24, 3, 3]
+ outputs_shapes (_type_): _description_
+ all_gt_smpl_body_pose_list (_type_): _description_
+ all_gt_smpl_betas_list (_type_): _description_
+ all_gt_kp2d_list (Torch.tensor):
+ all_gt_kp3d_list (list): with shape [B, N, K, D]
+ img_metas_list (_type_): _description_
+ all_gt_ignore_list (_type_): _description_
+ """
+ num_img = outputs_poses.size(0) # batch_size
+ all_pred_smpl_pose_list = [outputs_poses[i] for i in range(num_img)]
+ all_pred_smpl_shape_list = [outputs_shapes[i] for i in range(num_img)]
+ all_pred_kp3d_list = [outputs_kp3ds[i] for i in range(num_img)]
+ all_pred_vert_list = [outputs_verts[i] for i in range(num_img)]
+ all_pred_cam_list = [outputs_cameras[i] for i in range(num_img)]
+
+ gt_bboxes_ignore_list = [all_gt_ignore_list for _ in range(num_img)]
+
+ if all_has_keypoints2d_list is None:
+ all_has_keypoints2d_list = [
+ all_has_keypoints2d_list for _ in range(num_img)
+ ]
+
+ if all_has_keypoints3d_list is None:
+ all_has_keypoints3d_list = [
+ all_has_keypoints3d_list for _ in range(num_img)
+ ]
+
+ if all_has_smpl_list is None:
+ all_has_smpl_list = [all_has_smpl_list for _ in range(num_img)]
+
+ # for each batch data
+ (kp2d_list, kp2d_weight_list, kp3d_list, kp3d_weight_list,
+ smpl_pose_list, smpl_pose_weight_list, smpl_shape_list,
+ smpl_shape_weight_list, vert_list, vert_weight_list, has_smpl_list,
+ has_keypoints2d_list, has_keypoints3d_list, pos_inds_list,
+ neg_inds_list) = multi_apply(
+ self.prepare_targets,
+ all_pred_smpl_pose_list,
+ all_pred_smpl_shape_list,
+ all_pred_kp3d_list,
+ all_pred_vert_list,
+ all_pred_cam_list,
+ all_gt_smpl_body_pose_list,
+ all_gt_smpl_betas_list,
+ all_gt_kp2d_list,
+ all_gt_kp3d_list,
+ all_has_keypoints2d_list,
+ all_has_keypoints3d_list,
+ all_has_smpl_list,
+ img_metas_list,
+ gt_bboxes_ignore_list,
+ )
+ num_total_pos = sum((inds.numel() for inds in pos_inds_list))
+ num_total_neg = sum((inds.numel() for inds in neg_inds_list))
+
+ K = outputs_kp3ds.shape[-2]
+
+ gt_kp2d = torch.cat(kp2d_list, 0)
+ kp2d_weight = torch.cat(kp2d_weight_list, 0)
+ pred_cam = outputs_cameras.reshape(-1, 3)
+ # pred_kp2d = torch.cat()
+
+ gt_kp3d = torch.cat(kp3d_list, 0)
+ kp3d_weight = torch.cat(kp3d_weight_list, 0)
+ pred_kp3d = outputs_kp3ds.reshape(-1, K, 3)
+
+ gt_smpl_pose = torch.cat(smpl_pose_list, 0)
+ smpl_pose_weight = torch.cat(smpl_pose_weight_list, 0)
+ pred_smpl_pose = outputs_poses.reshape(-1, 24, 3, 3)
+
+ gt_smpl_shape = torch.cat(smpl_shape_list, 0)
+ smpl_shape_weight = torch.cat(smpl_shape_weight_list, 0)
+ pred_smpl_shape = outputs_shapes.reshape(-1, 10)
+
+ gt_vert = torch.cat(vert_list, 0)
+ vert_weight = torch.cat(vert_weight_list, 0)
+ pred_verts = outputs_verts.reshape(-1, 6890, 3)
+
+ has_smpl = torch.cat(has_smpl_list, 0).squeeze()
+ has_keypoints2d = torch.cat(has_keypoints2d_list, 0).squeeze()
+ has_keypoints3d = torch.cat(has_keypoints3d_list, 0).squeeze()
+
+ # losses = {}
+ if self.loss_keypoints2d is not None:
+ keypoints2d_loss = self.compute_keypoints2d_loss(
+ pred_kp3d, pred_cam, gt_kp2d, has_keypoints2d=has_keypoints2d)
+ else:
+ keypoints2d_loss = 0.0
+
+ if self.loss_keypoints3d is not None:
+ keypoints3d_loss = self.compute_keypoints3d_loss(
+ pred_kp3d,
+ gt_kp3d,
+ has_keypoints3d=has_keypoints3d,
+ )
+ else:
+ keypoints3d_loss = 0.0
+
+ if self.loss_vertex is not None:
+ vertex_loss = self.compute_vertex_loss(pred_verts,
+ gt_vert,
+ has_smpl=has_smpl)
+ else:
+ vertex_loss = 0.0
+
+ if self.loss_smpl_pose is not None:
+ smpl_pose_loss = self.compute_smpl_pose_loss(pred_smpl_pose,
+ gt_smpl_pose,
+ has_smpl=has_smpl)
+ else:
+ smpl_pose_loss = 0.0
+
+ if self.loss_smpl_betas is not None:
+ smpl_betas_loss = self.compute_smpl_betas_loss(pred_smpl_shape,
+ gt_smpl_shape,
+ has_smpl=has_smpl)
+ else:
+ smpl_betas_loss = 0.0
+ # if self.loss_iou is not None:
+ # losses['iou_loss'] = self.loss_iou()
+
+ # if self.loss_bbox is not None:
+ # losses['bbox_loss'] = self.loss_bbox()
+
+ # if self.loss_cls is not None:
+ # losses['cls_loss'] = self.loss_bbox()
+
+ return (keypoints2d_loss, keypoints3d_loss, vertex_loss,
+ smpl_pose_loss, smpl_betas_loss)
+
+ def prepare_targets(self, pred_smpl_pose, pred_smpl_shape, pred_kp3d,
+ pred_vert, pred_cam, gt_smpl_pose, gt_smpl_shape,
+ gt_kp2d, gt_kp3d, has_keypoints2d, has_keypoints3d,
+ has_smpl, img_meta, gt_bboxes_ignore):
+ """_summary_
+
+ Args:
+ all_pred_smpl_pose (_type_): _description_
+ all_pred_smpl_shape (_type_): _description_
+ all_pred_kp3d (_type_): _description_
+ all_pred_vert (_type_): _description_
+ all_gt_smpl_body_pose (_type_): _description_
+ all_gt_smpl_betas (_type_): _description_
+ all_gt_kp2d (_type_): _description_
+ all_gt_kp3d (_type_): with shape [N, K, D]
+ img_meta (_type_): _description_
+ gt_bboxes_ignore (_type_): _description_
+ """
+ num_query = pred_smpl_pose.shape[0]
+ assign_result = self.assigner.assign(pred_smpl_pose, pred_smpl_shape,
+ pred_kp3d, pred_vert, pred_cam,
+ gt_smpl_pose, gt_smpl_shape,
+ gt_kp2d, gt_kp3d, has_keypoints2d,
+ has_keypoints3d, has_smpl,
+ img_meta, gt_bboxes_ignore)
+
+ gt_smpl_pose = gt_smpl_pose.float()
+ gt_smpl_shape = gt_smpl_shape.float()
+ gt_kp2d = gt_kp2d.float()
+ gt_kp3d = gt_kp3d.float()
+ has_keypoints2d = has_keypoints2d.float()
+ has_keypoints3d = has_keypoints3d.float()
+ has_smpl = has_smpl.float()
+
+ sampling_result = self.sampler.sample(assign_result, pred_smpl_pose,
+ gt_smpl_pose)
+ pos_inds = sampling_result.pos_inds
+ neg_inds = sampling_result.neg_inds
+
+ # img_h, img_w, _ = img_meta['img_shape']
+
+ # kp2d target
+ kp2d_targets = torch.zeros_like(pred_kp3d[..., :2])
+ kp2d_weights = torch.zeros_like(pred_kp3d[..., :2])
+ kp2d_targets[pos_inds] = gt_kp2d[sampling_result.pos_assigned_gt_inds][
+ ..., :2]
+ kp2d_weights[pos_inds] = gt_kp2d[sampling_result.pos_assigned_gt_inds][
+ ..., [2]].repeat(1, 1, 2)
+ kp2d_targets = torch.cat(
+ [kp2d_targets, kp2d_weights[..., 0].unsqueeze(-1)], dim=-1)
+ # kp3d target
+ kp3d_targets = torch.zeros_like(pred_kp3d)
+ kp3d_weights = torch.zeros_like(pred_kp3d)
+ kp3d_targets[pos_inds] = gt_kp3d[sampling_result.pos_assigned_gt_inds][
+ ..., :3]
+ kp3d_weights[pos_inds] = gt_kp3d[sampling_result.pos_assigned_gt_inds][
+ ..., [3]].repeat(1, 1, 3)
+ kp3d_targets = torch.cat(
+ [kp3d_targets, kp3d_weights[..., 0].unsqueeze(-1)], dim=-1)
+
+ # smpl_pose target
+ smpl_pose_targets = torch.zeros_like(pred_smpl_pose)
+ smpl_pose_weights = torch.zeros_like(pred_smpl_pose)
+ gt_smpl_pose_rotmat = batch_rodrigues(gt_smpl_pose.view(-1, 3)).view(
+ -1, 24, 3, 3)
+ smpl_pose_targets[pos_inds] = gt_smpl_pose_rotmat[
+ sampling_result.pos_assigned_gt_inds]
+ smpl_pose_weights[pos_inds] = 1.0
+
+ # smpl_beta target
+ smpl_shape_targets = torch.zeros_like(pred_smpl_shape)
+ smpl_shape_weights = torch.zeros_like(pred_smpl_shape)
+ smpl_shape_targets[pos_inds] = gt_smpl_shape[
+ sampling_result.pos_assigned_gt_inds]
+ smpl_shape_weights[pos_inds] = 1.0
+
+ # verts
+ if self.body_model_train is not None:
+ gt_output = self.body_model_train(
+ betas=gt_smpl_shape,
+ body_pose=gt_smpl_pose_rotmat[:, 1:],
+ global_orient=gt_smpl_pose_rotmat[:, 0].unsqueeze(1),
+ pose2rot=False)
+ gt_vertices = gt_output['vertices']
+ gt_model_joints = gt_output['joints']
+
+ vert_targets = torch.zeros_like(pred_vert)
+ vert_weights = torch.zeros_like(pred_vert)
+ vert_targets[pos_inds] = gt_vertices[
+ sampling_result.pos_assigned_gt_inds]
+ vert_weights[pos_inds] = 1.0
+
+ if has_keypoints2d is not None:
+ has_keypoints2d_ = torch.zeros(
+ (num_query, 1)).to(smpl_pose_targets.device)
+ has_keypoints2d_[pos_inds] = has_keypoints2d[
+ sampling_result.pos_assigned_gt_inds]
+ else:
+ has_keypoints2d_ = None
+
+ if has_keypoints3d is not None:
+ has_keypoints3d_ = torch.zeros(
+ (num_query, 1)).to(smpl_pose_targets.device)
+ has_keypoints3d_[pos_inds] = has_keypoints3d[
+ sampling_result.pos_assigned_gt_inds]
+ else:
+ has_keypoints3d_ = None
+
+ if has_smpl is not None:
+ has_smpl_ = torch.zeros(
+ (num_query, 1)).to(smpl_pose_targets.device)
+ # if len(sampling_result.pos_assigned_gt_inds) == 1:
+ # has_smpl_[pos_inds] = has_smpl
+ # else:
+ has_smpl_[pos_inds] = has_smpl[
+ sampling_result.pos_assigned_gt_inds]
+ else:
+ has_smpl_ = None
+ return (kp2d_targets, kp2d_weights, kp3d_targets, kp3d_weights,
+ smpl_pose_targets, smpl_pose_weights, smpl_shape_targets,
+ smpl_shape_weights, vert_targets, vert_weights, has_smpl_,
+ has_keypoints2d_, has_keypoints3d_, pos_inds, neg_inds)
+
+ def forward_test(self, img, img_metas, **kwargs):
+ batch_input_shape = tuple(img[0].size()[-2:])
+ for img_meta in img_metas:
+ img_meta['batch_input_shape'] = batch_input_shape
+ features = self.backbone(img)
+ if self.neck is not None:
+ features = self.neck(features)
+ pred_pose, pred_betas, pred_cam, _, _ = \
+ self.head(features, img_metas)
+
+ # pred_pose = pred_pose[-1]
+ # pred_betas = pred_betas[-1]
+ # pred_cam = pred_cam[-1]
+
+ L, B, N = pred_pose.shape[:3]
+ if self.body_model_test is not None:
+ pred_output = self.body_model_test(
+ betas=pred_betas.reshape(L * B * N, 10),
+ body_pose=pred_pose.reshape(L * B * N, 24, 3, 3)[:, 1:],
+ global_orient=pred_pose.reshape(L * B * N, 24, 3,
+ 3)[:, 0].unsqueeze(1),
+ pose2rot=False)
+ else:
+            raise ValueError('Please provide a built body model.')
+
+ pred_keypoints_3d = pred_output['joints'].reshape(L, B, N, -1, 3)
+ pred_keypoints_3d = (pred_keypoints_3d -
+ pred_keypoints_3d[..., [0], :])
+ pred_keypoints_3d = pred_keypoints_3d.detach().cpu().numpy()
+ # pred_vertices = pred_output['vertices'].reshape(L, B, N, 6890, 3)
+ pred_cam = pred_cam.detach().cpu().numpy()
+ pred_pose = pred_pose.detach().cpu().numpy()
+ pred_betas = pred_betas.detach().cpu().numpy()
+ # batch, instance_num, kp_num, 4
+ gt_keypoints3d = kwargs['keypoints3d'].repeat([1, N, 1, 1]).clone()
+ # keypoints3d_mask = kwargs['keypoints3d_mask']
+ gt_keypoints3d = gt_keypoints3d.detach().cpu().numpy()
+ # gt_keypoints3d, _ = convert_kps(
+ # gt_keypoints3d,
+ # src='human_data',
+ # dst='h36m')
+
+ cost = np.sum((pred_keypoints_3d[-1] - gt_keypoints3d[..., :3]),
+ axis=(2, 3))
+ index = np.argmin(abs(cost), -1)
+
+ pred_keypoints_3d_ = []
+ pred_pose_ = []
+ pred_betas_ = []
+ pred_cam_ = []
+
+ for batch_i in range(B):
+ ind = index[batch_i]
+ pred_keypoints_3d_.append(pred_keypoints_3d[-1, batch_i, ind])
+ pred_pose_.append(pred_pose[-1, batch_i, ind])
+ pred_betas_.append(pred_betas[-1, batch_i, ind])
+ pred_cam_.append(pred_cam[-1, batch_i, ind])
+
+ # for img_id in range(len(img_metas)):
+ # pred_pose_ = pred_pose[:, img_id]
+ # pred_betas_ = pred_betas[:, img_id]
+ # pred_cam_ = pred_cam[:, img_id]
+ # pred_keypoints_3d_ = pred_keypoints_3d[:, img_id]
+ # pred_vertices_ = pred_vertices[:, img_id]
+ # img_shape_ = img_metas[img_id]['img_shape']
+
+ # result_list.append()
+
+ all_preds = {}
+ all_preds['keypoints_3d'] = np.array(pred_keypoints_3d_)
+ all_preds['smpl_pose'] = np.array(pred_pose_)
+ all_preds['smpl_beta'] = np.array(pred_betas_)
+ all_preds['camera'] = np.array(pred_cam_)
+ # all_preds['vertices'] = pred_vertices.detach().cpu().numpy()
+
+ image_path = []
+ for img_meta in img_metas:
+ image_path.append(img_meta['image_path'])
+ all_preds['image_path'] = image_path
+ all_preds['image_idx'] = kwargs['sample_idx']
+ return all_preds
+ # loss
+
+ def compute_keypoints3d_loss(
+ self,
+ pred_keypoints3d: torch.Tensor,
+ gt_keypoints3d: torch.Tensor,
+ has_keypoints3d: Optional[torch.Tensor] = None):
+ """Compute loss for 3d keypoints."""
+ keypoints3d_conf = gt_keypoints3d[:, :, 3].float().unsqueeze(-1)
+ keypoints3d_conf = keypoints3d_conf.repeat(1, 1, 3)
+ pred_keypoints3d = pred_keypoints3d.float()
+ gt_keypoints3d = gt_keypoints3d[:, :, :3].float()
+
+ # currently, only mpi_inf_3dhp and h36m have 3d keypoints
+ # both datasets have right_hip_extra and left_hip_extra
+ right_hip_idx = get_keypoint_idx('right_hip_extra', self.convention)
+ left_hip_idx = get_keypoint_idx('left_hip_extra', self.convention)
+ gt_pelvis = (gt_keypoints3d[:, right_hip_idx, :] +
+ gt_keypoints3d[:, left_hip_idx, :]) / 2
+ pred_pelvis = (pred_keypoints3d[:, right_hip_idx, :] +
+ pred_keypoints3d[:, left_hip_idx, :]) / 2
+
+ gt_keypoints3d = gt_keypoints3d - gt_pelvis[:, None, :]
+ pred_keypoints3d = pred_keypoints3d - pred_pelvis[:, None, :]
+ loss = self.loss_keypoints3d(pred_keypoints3d,
+ gt_keypoints3d,
+ reduction_override='none')
+
+ # If has_keypoints3d is not None, then computes the losses on the
+ # instances that have ground-truth keypoints3d.
+ # But the zero confidence keypoints will be included in mean.
+ # Otherwise, only compute the keypoints3d
+ # which have positive confidence.
+
+ # has_keypoints3d is None when the key has_keypoints3d
+ # is not in the datasets
+ if has_keypoints3d is None:
+
+ valid_pos = keypoints3d_conf > 0
+ if keypoints3d_conf[valid_pos].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints3d)
+ loss = torch.sum(loss * keypoints3d_conf)
+ loss /= keypoints3d_conf[valid_pos].numel()
+ else:
+
+ keypoints3d_conf = keypoints3d_conf[has_keypoints3d == 1]
+ if keypoints3d_conf.shape[0] == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints3d)
+ loss = loss[has_keypoints3d == 1]
+ loss = (loss * keypoints3d_conf).mean()
+ return loss
+
+ def compute_keypoints2d_loss(
+ self,
+ pred_keypoints3d: torch.Tensor,
+ pred_cam: torch.Tensor,
+ gt_keypoints2d: torch.Tensor,
+ img_res: Optional[int] = 512,
+ focal_length: Optional[int] = 5000.,
+ has_keypoints2d: Optional[torch.Tensor] = None):
+ """Compute loss for 2d keypoints."""
+ keypoints2d_conf = gt_keypoints2d[:, :, 2].float().unsqueeze(-1)
+ keypoints2d_conf = keypoints2d_conf.repeat(1, 1, 2)
+ gt_keypoints2d = gt_keypoints2d[:, :, :2].float()
+ pred_keypoints2d = project_points(pred_keypoints3d,
+ pred_cam,
+ focal_length=focal_length,
+ img_res=img_res)
+ # Normalize keypoints to [-1,1]
+ # The coordinate origin of pred_keypoints_2d is
+ # the center of the input image.
+ pred_keypoints2d = 2 * pred_keypoints2d / (img_res - 1)
+ # The coordinate origin of gt_keypoints_2d is
+ # the top left corner of the input image.
+ gt_keypoints2d = 2 * gt_keypoints2d / (img_res - 1) - 1
+ loss = self.loss_keypoints2d(pred_keypoints2d,
+ gt_keypoints2d,
+ reduction_override='none')
+
+ # If has_keypoints2d is not None, then computes the losses on the
+ # instances that have ground-truth keypoints2d.
+ # But the zero confidence keypoints will be included in mean.
+ # Otherwise, only compute the keypoints2d
+ # which have positive confidence.
+ # has_keypoints2d is None when the key has_keypoints2d
+ # is not in the datasets
+
+ if has_keypoints2d is None:
+ valid_pos = keypoints2d_conf > 0
+ if keypoints2d_conf[valid_pos].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints2d)
+ loss = torch.sum(loss * keypoints2d_conf)
+ loss /= keypoints2d_conf[valid_pos].numel()
+ else:
+ keypoints2d_conf = keypoints2d_conf[has_keypoints2d == 1]
+ if keypoints2d_conf.shape[0] == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints2d)
+ loss = loss[has_keypoints2d == 1]
+ loss = (loss * keypoints2d_conf).mean()
+
+ return loss
+
+ def compute_vertex_loss(self, pred_vertices: torch.Tensor,
+ gt_vertices: torch.Tensor, has_smpl: torch.Tensor):
+ """Compute loss for vertices."""
+ gt_vertices = gt_vertices.float()
+ conf = has_smpl.float().view(-1, 1, 1)
+ conf = conf.repeat(1, gt_vertices.shape[1], gt_vertices.shape[2])
+ loss = self.loss_vertex(pred_vertices,
+ gt_vertices,
+ reduction_override='none')
+ valid_pos = conf > 0
+ if conf[valid_pos].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_vertices)
+ loss = torch.sum(loss * conf) / conf[valid_pos].numel()
+ return loss
+
+ def compute_smpl_pose_loss(self, pred_pose: torch.Tensor,
+ gt_pose: torch.Tensor, has_smpl: torch.Tensor):
+ """Compute loss for smpl pose."""
+ conf = has_smpl.float().view(-1)
+ valid_pos = conf > 0
+ if conf[valid_pos].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_pose)
+ pred_pose = pred_pose[valid_pos]
+ gt_pose = gt_pose[valid_pos]
+ conf = conf[valid_pos]
+ # gt_rotmat = batch_rodrigues(gt_pose.view(-1, 3)).view(-1, 24, 3, 3)
+ loss = self.loss_smpl_pose(pred_pose,
+ gt_pose,
+ reduction_override='none')
+ loss = loss.view(loss.shape[0], -1).mean(-1)
+ loss = torch.mean(loss * conf)
+ return loss
+
+ def compute_smpl_betas_loss(self, pred_betas: torch.Tensor,
+ gt_betas: torch.Tensor,
+ has_smpl: torch.Tensor):
+ """Compute loss for smpl betas."""
+ conf = has_smpl.float().view(-1)
+ valid_pos = conf > 0
+ if conf[valid_pos].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_betas)
+ pred_betas = pred_betas[valid_pos]
+ gt_betas = gt_betas[valid_pos]
+ conf = conf[valid_pos]
+ loss = self.loss_smpl_betas(pred_betas,
+ gt_betas,
+ reduction_override='none')
+ loss = loss.view(loss.shape[0], -1).mean(-1)
+ loss = torch.mean(loss * conf)
+ return loss
+
+ def compute_camera_loss(self, cameras: torch.Tensor):
+ """Compute loss for predicted camera parameters."""
+ loss = self.loss_camera(cameras)
+ return loss
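The keypoint losses above share one pattern: pelvis-center the predicted and ground-truth joints, weight the element-wise loss by per-keypoint confidence, and average over the visible entries only. A simplified, self-contained sketch of that computation follows (plain L1, made-up hip indices, not the configurable loss modules built by `build_loss`):

```python
import torch

def kp3d_l1(pred, gt_with_conf, right_hip_idx=2, left_hip_idx=3):
    # gt_with_conf: (B, K, 4) = xyz + confidence; pred: (B, K, 3)
    conf = gt_with_conf[..., 3:].repeat(1, 1, 3)
    gt = gt_with_conf[..., :3]
    gt_pelvis = (gt[:, right_hip_idx] + gt[:, left_hip_idx]) / 2
    pred_pelvis = (pred[:, right_hip_idx] + pred[:, left_hip_idx]) / 2
    # Pelvis-align both sets of joints, then confidence-weight the L1 error.
    loss = (pred - pred_pelvis[:, None] - (gt - gt_pelvis[:, None])).abs()
    return (loss * conf).sum() / (conf > 0).sum().clamp(min=1)

pred = torch.randn(2, 54, 3)
gt = torch.cat([torch.randn(2, 54, 3), torch.ones(2, 54, 1)], dim=-1)
print(kp3d_l1(pred, gt))
```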
diff --git a/detrsmpl/models/architectures/DetrSMPLloss.py b/detrsmpl/models/architectures/DetrSMPLloss.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ac9b98dfd353c69f8bfdd503023c11910f54fe4
--- /dev/null
+++ b/detrsmpl/models/architectures/DetrSMPLloss.py
@@ -0,0 +1,739 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta
+from typing import Optional, Union
+
+import torch
+from scipy.optimize import linear_sum_assignment
+import numpy as np
+from detrsmpl.core.post_processing.bbox.assigners import build_assigner
+from detrsmpl.core.post_processing.bbox.samplers import build_sampler
+from detrsmpl.core.conventions.keypoints_mapping import (get_keypoint_idx,
+ convert_kps)
+from detrsmpl.utils.geometry import batch_rodrigues
+from detrsmpl.utils.geometry import project_points
+from detrsmpl.utils.misc import multi_apply
+from ..backbones.builder import build_backbone
+from ..body_models.builder import build_body_model
+from ..heads.builder import build_head
+from ..losses.builder import build_loss
+from ..necks.builder import build_neck
+from .base_architecture import BaseArchitecture
+
+# from mmdet.core import bbox2result
+
+
+class DETRLoss(BaseArchitecture, metaclass=ABCMeta):
+ def __init__(
+ self,
+ body_model_train: Optional[Union[dict, None]] = None,
+ body_model_test: Optional[Union[dict, None]] = None,
+ convention: Optional[str] = 'human_data',
+ loss_keypoints2d: Optional[Union[dict, None]] = None,
+ loss_keypoints3d: Optional[Union[dict, None]] = None,
+ loss_vertex: Optional[Union[dict, None]] = None,
+ loss_smpl_pose: Optional[Union[dict, None]] = None,
+ loss_smpl_betas: Optional[Union[dict, None]] = None,
+ loss_camera: Optional[Union[dict, None]] = None,
+ loss_cls: Optional[Union[dict,
+ None]] = dict(type='CrossEntropyLoss',
+ bg_cls_weight=0.1,
+ use_sigmoid=False,
+ loss_weight=1.0,
+ class_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ init_cfg: Optional[Union[list, dict, None]] = None,
+ train_cfg:
+ Optional[Union[dict, None]] = dict(assigner=dict(
+ type='HungarianAssigner',
+ kp3d_cost=dict(
+ type='Keypoints3DCost', convention='smpl_54', weight=5.0),
+ kp2d_cost=dict(
+ type='Keypoints2DCost', convention='smpl_54', weight=5.0),
+ # cls_cost=dict(type='ClassificationCost', weight=1.),
+ # reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+ # iou_cost=dict(
+ # type='IoUCost', iou_mode='giou', weight=2.0))
+ )),
+ test_cfg: Optional[Union[dict, None]] = None):
+
+ super(DETRLoss, self).__init__(init_cfg)
+ if train_cfg:
+ assert 'assigner' in train_cfg, 'assigner should be provided '\
+ 'when train_cfg is set.'
+ assigner = train_cfg['assigner']
+ # TODO: update these
+ # assert loss_cls['loss_weight'] == assigner['kp3d_cost']['weight'], \
+ # 'The classification weight for loss and matcher should be' \
+ # 'exactly the same.'
+ # assert loss_bbox['loss_weight'] == assigner['kp3d_cost'][
+ # 'weight'], 'The regression L1 weight for loss and matcher ' \
+ # 'should be exactly the same.'
+ # assert loss_iou['loss_weight'] == assigner['kp3d_cost']['weight'], \
+ # 'The regression iou weight for loss and matcher should be' \
+ # 'exactly the same.'
+ self.assigner = build_assigner(assigner)
+ # DETR sampling=False, so use PseudoSampler
+ sampler_cfg = dict(type='PseudoSampler')
+ self.sampler = build_sampler(sampler_cfg, context=self)
+
+ self.train_cfg = train_cfg
+ self.test_cfg = test_cfg
+
+ # build loss
+ self.loss_keypoints2d = build_loss(loss_keypoints2d)
+ self.loss_keypoints3d = build_loss(loss_keypoints3d)
+ self.loss_vertex = build_loss(loss_vertex)
+ self.loss_smpl_pose = build_loss(loss_smpl_pose)
+ self.loss_smpl_betas = build_loss(loss_smpl_betas)
+ self.loss_cls = build_loss(loss_cls)
+ self.loss_bbox = build_loss(loss_bbox)
+ self.loss_iou = build_loss(loss_iou)
+
+ self.body_model_train = build_body_model(body_model_train)
+ self.body_model_test = build_body_model(body_model_test)
+ self.convention = convention
+
+ def forward_train(self, preds, targets):
+ pass
+
+ def forward(self, preds, targets):
+ """
+ Args:
+ img (Tensor): Input images of shape (N, C, H, W).
+ Typically these should be mean centered and std scaled.
+ img_metas (list[dict]): A List of image info dict where each dict
+ has: 'img_shape', 'scale_factor', 'flip', and may also contain
+ 'filename', 'ori_shape', 'pad_shape', and 'img_norm_cfg'.
+ For details on the values of these keys see
+ :class:`mmdet.datasets.pipelines.Collect`.
+ gt_bboxes (list[Tensor]): Each item are the truth boxes for each
+ image in [tl_x, tl_y, br_x, br_y] format.
+ gt_labels (list[Tensor]): Class indices corresponding to each box
+ gt_bboxes_ignore (None | list[Tensor]): Specify which bounding
+ boxes can be ignored when computing the loss.
+
+ Returns:
+ dict[str, Tensor]: A dictionary of loss components.
+ """
+ # super(SingleStageDetector, self).forward_train(img, img_metas)
+ # NOTE the batched image size information may be useful, e.g.
+ # in DETR, this is needed for the construction of masks, which is
+ # then used for the transformer_head.
+ pred_pose = preds['pred_pose']
+ pred_betas = preds['pred_betas']
+ pred_cameras = preds['pred_cameras']
+ has_smpl = targets['has_smpl']
+ gt_smpl_body_pose = targets[
+ 'smpl_body_pose'] # [bs_0: [ins_num, 23, 3]]
+ gt_smpl_global_orient = targets['smpl_global_orient']
+ gt_smpl_body_pose = \
+ [torch.cat((gt_smpl_global_orient[i].view(-1, 1, 3),
+ gt_smpl_body_pose[i]), dim=1).float()
+ for i in range(len(gt_smpl_body_pose))]
+ gt_smpl_betas = targets['smpl_betas']
+ gt_smpl_transl = targets['smpl_transl']
+ gt_keypoints2d = targets['keypoints2d']
+        gt_keypoints3d = targets['keypoints3d']  # [bs_0: [N, K, D], ...]
+ img_metas = targets['img_metas']
+ if 'has_keypoints3d' in targets:
+ has_keypoints3d = targets['has_keypoints3d']
+ else:
+ has_keypoints3d = None
+
+ if 'has_keypoints2d' in targets:
+ has_keypoints2d = targets['has_keypoints2d']
+ else:
+ has_keypoints2d = None
+
+ img = targets['img']
+
+ batch_input_shape = tuple(img[0].size()[-2:])
+ for img_meta in img_metas:
+ img_meta['batch_input_shape'] = batch_input_shape
+
+ L, B, N = pred_pose.shape[:3]
+ if self.body_model_train is not None:
+ pred_output = self.body_model_train(
+ betas=pred_betas.reshape(L * B * N, 10),
+ body_pose=pred_pose.reshape(L * B * N, 24, 3, 3)[:, 1:],
+ global_orient=pred_pose.reshape(L * B * N, 24, 3,
+ 3)[:, 0].unsqueeze(1),
+ pose2rot=False,
+ num_joints=gt_keypoints2d[0].shape[1])
+ pred_keypoints3d = pred_output['joints'].reshape(L, B, N, -1, 3)
+ pred_vertices = pred_output['vertices'].reshape(L, B, N, 6890, 3)
+ # loss
+ num_dec_layers = pred_pose.shape[0]
+
+ all_gt_smpl_body_pose_list = [
+ gt_smpl_body_pose for _ in range(num_dec_layers)
+ ]
+ all_gt_smpl_global_orient_list = [
+ gt_smpl_global_orient for _ in range(num_dec_layers)
+ ]
+ all_gt_smpl_betas_list = [gt_smpl_betas for _ in range(num_dec_layers)]
+ all_gt_smpl_transl_list = [
+ gt_smpl_transl for _ in range(num_dec_layers)
+ ]
+ all_gt_keypoints2d_list = [
+ gt_keypoints2d for _ in range(num_dec_layers)
+ ]
+ all_gt_keypoints3d_list = [
+ gt_keypoints3d for _ in range(num_dec_layers)
+ ]
+ all_has_smpl_list = [has_smpl for _ in range(num_dec_layers)]
+ all_has_keypoints3d_list = [
+ has_keypoints3d for _ in range(num_dec_layers)
+ ]
+ all_has_keypoints2d_list = [
+ has_keypoints2d for _ in range(num_dec_layers)
+ ]
+ all_gt_ignore_list = [None for _ in range(num_dec_layers)]
+ img_metas_list = [img_metas for _ in range(num_dec_layers)]
+ # all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
+ # all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
+ # all_gt_bboxes_ignore_list = [
+ # gt_bboxes_ignore for _ in range(num_dec_layers)
+ # ]
+        # compute the loss for each decoder layer
+ (kp2d_loss, kp3d_loss, vert_loss, pose_loss, beta_loss) = multi_apply(
+ self.compute_losses, pred_pose, pred_betas, pred_keypoints3d,
+ pred_vertices, pred_cameras, all_gt_smpl_body_pose_list,
+ all_gt_smpl_betas_list, all_gt_keypoints2d_list,
+ all_gt_keypoints3d_list, all_has_keypoints2d_list,
+ all_has_keypoints3d_list, all_has_smpl_list, img_metas_list,
+ all_gt_ignore_list)
+
+ losses = {}
+ losses['keypoints2d_loss'] = kp2d_loss[-1]
+ losses['keypoints3d_loss'] = kp3d_loss[-1]
+ losses['vertex_loss'] = vert_loss[-1]
+ losses['smpl_pose_loss'] = pose_loss[-1]
+ losses['smpl_betas_loss'] = beta_loss[-1]
+
+ # loss from other decoder layers
+ num_dec_layer = 0
+ for (kp2d_loss_i, kp3d_loss_i, vert_loss_i, pose_loss_i,
+ beta_loss_i) in zip(kp2d_loss[:-1], kp3d_loss[:-1],
+ vert_loss[:-1], pose_loss[:-1],
+ beta_loss[:-1]):
+ losses[f'd{num_dec_layer}.keypoints2d_loss'] = kp2d_loss_i
+ losses[f'd{num_dec_layer}.keypoints3d_loss'] = kp3d_loss_i
+ losses[f'd{num_dec_layer}.vertex_loss'] = vert_loss_i
+ losses[f'd{num_dec_layer}.smpl_pose_loss'] = pose_loss_i
+ losses[f'd{num_dec_layer}.smpl_betas_loss'] = beta_loss_i
+ num_dec_layer += 1
+
+ return losses
+
+ def compute_losses(self,
+ outputs_poses,
+ outputs_shapes,
+ outputs_kp3ds,
+ outputs_verts,
+ outputs_cameras,
+ all_gt_smpl_body_pose_list,
+ all_gt_smpl_betas_list,
+ all_gt_kp2d_list,
+ all_gt_kp3d_list,
+ all_has_keypoints2d_list,
+ all_has_keypoints3d_list,
+ all_has_smpl_list,
+ img_metas_list,
+ all_gt_ignore_list=None):
+ """_summary_
+ loss_single
+ get_targets
+ Args:
+ outputs_poses (_type_): with shape [B, N, 24, 3, 3]
+ outputs_shapes (_type_): _description_
+ all_gt_smpl_body_pose_list (_type_): _description_
+ all_gt_smpl_betas_list (_type_): _description_
+ all_gt_kp2d_list (Torch.tensor):
+ all_gt_kp3d_list (list): with shape [B, N, K, D]
+ img_metas_list (_type_): _description_
+ all_gt_ignore_list (_type_): _description_
+ """
+ num_img = outputs_poses.size(0) # batch_size
+ all_pred_smpl_pose_list = [outputs_poses[i] for i in range(num_img)]
+ all_pred_smpl_shape_list = [outputs_shapes[i] for i in range(num_img)]
+ all_pred_kp3d_list = [outputs_kp3ds[i] for i in range(num_img)]
+ all_pred_vert_list = [outputs_verts[i] for i in range(num_img)]
+ all_pred_cam_list = [outputs_cameras[i] for i in range(num_img)]
+
+ gt_bboxes_ignore_list = [all_gt_ignore_list for _ in range(num_img)]
+
+ if all_has_keypoints2d_list is None:
+ all_has_keypoints2d_list = [
+ all_has_keypoints2d_list for _ in range(num_img)
+ ]
+
+ if all_has_keypoints3d_list is None:
+ all_has_keypoints3d_list = [
+ all_has_keypoints3d_list for _ in range(num_img)
+ ]
+
+ if all_has_smpl_list is None:
+ all_has_smpl_list = [all_has_smpl_list for _ in range(num_img)]
+
+ # for each batch data
+ (kp2d_list, kp2d_weight_list, kp3d_list, kp3d_weight_list,
+ smpl_pose_list, smpl_pose_weight_list, smpl_shape_list,
+ smpl_shape_weight_list, vert_list, vert_weight_list, has_smpl_list,
+ has_keypoints2d_list, has_keypoints3d_list, pos_inds_list,
+ neg_inds_list) = multi_apply(
+ self.prepare_targets,
+ all_pred_smpl_pose_list,
+ all_pred_smpl_shape_list,
+ all_pred_kp3d_list,
+ all_pred_vert_list,
+ all_pred_cam_list,
+ all_gt_smpl_body_pose_list,
+ all_gt_smpl_betas_list,
+ all_gt_kp2d_list,
+ all_gt_kp3d_list,
+ all_has_keypoints2d_list,
+ all_has_keypoints3d_list,
+ all_has_smpl_list,
+ img_metas_list,
+ gt_bboxes_ignore_list,
+ )
+ num_total_pos = sum((inds.numel() for inds in pos_inds_list))
+ num_total_neg = sum((inds.numel() for inds in neg_inds_list))
+
+ K = outputs_kp3ds.shape[-2]
+
+ gt_kp2d = torch.cat(kp2d_list, 0)
+ kp2d_weight = torch.cat(kp2d_weight_list, 0)
+ pred_cam = outputs_cameras.reshape(-1, 3)
+ # pred_kp2d = torch.cat()
+
+ gt_kp3d = torch.cat(kp3d_list, 0)
+ kp3d_weight = torch.cat(kp3d_weight_list, 0)
+ pred_kp3d = outputs_kp3ds.reshape(-1, K, 3)
+
+ gt_smpl_pose = torch.cat(smpl_pose_list, 0)
+ smpl_pose_weight = torch.cat(smpl_pose_weight_list, 0)
+ pred_smpl_pose = outputs_poses.reshape(-1, 24, 3, 3)
+
+ gt_smpl_shape = torch.cat(smpl_shape_list, 0)
+ smpl_shape_weight = torch.cat(smpl_shape_weight_list, 0)
+ pred_smpl_shape = outputs_shapes.reshape(-1, 10)
+
+ gt_vert = torch.cat(vert_list, 0)
+ vert_weight = torch.cat(vert_weight_list, 0)
+ pred_verts = outputs_verts.reshape(-1, 6890, 3)
+
+ has_smpl = torch.cat(has_smpl_list, 0).squeeze()
+ has_keypoints2d = torch.cat(has_keypoints2d_list, 0).squeeze()
+ has_keypoints3d = torch.cat(has_keypoints3d_list, 0).squeeze()
+
+ # losses = {}
+ if self.loss_keypoints2d is not None:
+ keypoints2d_loss = self.compute_keypoints2d_loss(
+ pred_kp3d, pred_cam, gt_kp2d, has_keypoints2d=has_keypoints2d)
+ else:
+ keypoints2d_loss = 0.0
+
+ if self.loss_keypoints3d is not None:
+ keypoints3d_loss = self.compute_keypoints3d_loss(
+ pred_kp3d,
+ gt_kp3d,
+ has_keypoints3d=has_keypoints3d,
+ )
+ else:
+ keypoints3d_loss = 0.0
+
+ if self.loss_vertex is not None:
+ vertex_loss = self.compute_vertex_loss(pred_verts,
+ gt_vert,
+ has_smpl=has_smpl)
+ else:
+ vertex_loss = 0.0
+
+ if self.loss_smpl_pose is not None:
+ smpl_pose_loss = self.compute_smpl_pose_loss(pred_smpl_pose,
+ gt_smpl_pose,
+ has_smpl=has_smpl)
+ else:
+ smpl_pose_loss = 0.0
+
+ if self.loss_smpl_betas is not None:
+ smpl_betas_loss = self.compute_smpl_betas_loss(pred_smpl_shape,
+ gt_smpl_shape,
+ has_smpl=has_smpl)
+ else:
+ smpl_betas_loss = 0.0
+ # if self.loss_iou is not None:
+ # losses['iou_loss'] = self.loss_iou()
+
+ # if self.loss_bbox is not None:
+ # losses['bbox_loss'] = self.loss_bbox()
+
+ # if self.loss_cls is not None:
+ # losses['cls_loss'] = self.loss_bbox()
+
+ return (keypoints2d_loss, keypoints3d_loss, vertex_loss,
+ smpl_pose_loss, smpl_betas_loss)
+
+ def prepare_targets(self, pred_smpl_pose, pred_smpl_shape, pred_kp3d,
+ pred_vert, pred_cam, gt_smpl_pose, gt_smpl_shape,
+ gt_kp2d, gt_kp3d, has_keypoints2d, has_keypoints3d,
+ has_smpl, img_meta, gt_bboxes_ignore):
+ """_summary_
+
+ Args:
+ all_pred_smpl_pose (_type_): _description_
+ all_pred_smpl_shape (_type_): _description_
+ all_pred_kp3d (_type_): _description_
+ all_pred_vert (_type_): _description_
+ all_gt_smpl_body_pose (_type_): _description_
+ all_gt_smpl_betas (_type_): _description_
+ all_gt_kp2d (_type_): _description_
+ all_gt_kp3d (_type_): with shape [N, K, D]
+ img_meta (_type_): _description_
+ gt_bboxes_ignore (_type_): _description_
+ """
+ num_query = pred_smpl_pose.shape[0]
+ assign_result = self.assigner.assign(pred_smpl_pose, pred_smpl_shape,
+ pred_kp3d, pred_vert, pred_cam,
+ gt_smpl_pose, gt_smpl_shape,
+ gt_kp2d, gt_kp3d, has_keypoints2d,
+ has_keypoints3d, has_smpl,
+ img_meta, gt_bboxes_ignore)
+
+ gt_smpl_pose = gt_smpl_pose.float()
+ gt_smpl_shape = gt_smpl_shape.float()
+ gt_kp2d = gt_kp2d.float()
+ gt_kp3d = gt_kp3d.float()
+ has_keypoints2d = has_keypoints2d.float()
+ has_keypoints3d = has_keypoints3d.float()
+ has_smpl = has_smpl.float()
+
+ sampling_result = self.sampler.sample(assign_result, pred_smpl_pose,
+ gt_smpl_pose)
+ pos_inds = sampling_result.pos_inds
+ neg_inds = sampling_result.neg_inds
+
+ # img_h, img_w, _ = img_meta['img_shape']
+
+ # kp2d target
+ kp2d_targets = torch.zeros_like(pred_kp3d[..., :2])
+ kp2d_weights = torch.zeros_like(pred_kp3d[..., :2])
+ kp2d_targets[pos_inds] = gt_kp2d[sampling_result.pos_assigned_gt_inds][
+ ..., :2]
+ kp2d_weights[pos_inds] = gt_kp2d[sampling_result.pos_assigned_gt_inds][
+ ..., [2]].repeat(1, 1, 2)
+ kp2d_targets = torch.cat(
+ [kp2d_targets, kp2d_weights[..., 0].unsqueeze(-1)], dim=-1)
+ # kp3d target
+ kp3d_targets = torch.zeros_like(pred_kp3d)
+ kp3d_weights = torch.zeros_like(pred_kp3d)
+ kp3d_targets[pos_inds] = gt_kp3d[sampling_result.pos_assigned_gt_inds][
+ ..., :3]
+ kp3d_weights[pos_inds] = gt_kp3d[sampling_result.pos_assigned_gt_inds][
+ ..., [3]].repeat(1, 1, 3)
+ kp3d_targets = torch.cat(
+ [kp3d_targets, kp3d_weights[..., 0].unsqueeze(-1)], dim=-1)
+ # smpl_pose target
+ smpl_pose_targets = torch.zeros_like(pred_smpl_pose)
+ smpl_pose_weights = torch.zeros_like(pred_smpl_pose)
+ gt_smpl_pose_rotmat = batch_rodrigues(gt_smpl_pose.view(-1, 3)).view(
+ -1, 24, 3, 3)
+ smpl_pose_targets[pos_inds] = gt_smpl_pose_rotmat[
+ sampling_result.pos_assigned_gt_inds]
+ smpl_pose_weights[pos_inds] = 1.0
+
+ # smpl_beta target
+ smpl_shape_targets = torch.zeros_like(pred_smpl_shape)
+ smpl_shape_weights = torch.zeros_like(pred_smpl_shape)
+ smpl_shape_targets[pos_inds] = gt_smpl_shape[
+ sampling_result.pos_assigned_gt_inds]
+ smpl_shape_weights[pos_inds] = 1.0
+
+ # verts
+ if self.body_model_train is not None:
+ gt_output = self.body_model_train(
+ betas=gt_smpl_shape,
+ body_pose=gt_smpl_pose_rotmat[:, 1:],
+ global_orient=gt_smpl_pose_rotmat[:, 0].unsqueeze(1),
+ pose2rot=False)
+ gt_vertices = gt_output['vertices']
+ gt_model_joints = gt_output['joints']
+
+ vert_targets = torch.zeros_like(pred_vert)
+ vert_weights = torch.zeros_like(pred_vert)
+ vert_targets[pos_inds] = gt_vertices[
+ sampling_result.pos_assigned_gt_inds]
+ vert_weights[pos_inds] = 1.0
+
+ if has_keypoints2d is not None:
+ has_keypoints2d_ = torch.zeros(
+ (num_query, 1)).to(smpl_pose_targets.device)
+ has_keypoints2d_[pos_inds] = has_keypoints2d[
+ sampling_result.pos_assigned_gt_inds]
+ else:
+ has_keypoints2d_ = None
+
+ if has_keypoints3d is not None:
+ has_keypoints3d_ = torch.zeros(
+ (num_query, 1)).to(smpl_pose_targets.device)
+ has_keypoints3d_[pos_inds] = has_keypoints3d[
+ sampling_result.pos_assigned_gt_inds]
+ else:
+ has_keypoints3d_ = None
+
+ if has_smpl is not None:
+ has_smpl_ = torch.zeros(
+ (num_query, 1)).to(smpl_pose_targets.device)
+ # if len(sampling_result.pos_assigned_gt_inds) == 1:
+ # has_smpl_[pos_inds] = has_smpl
+ # else:
+ has_smpl_[pos_inds] = has_smpl[
+ sampling_result.pos_assigned_gt_inds]
+ else:
+ has_smpl_ = None
+ return (kp2d_targets, kp2d_weights, kp3d_targets, kp3d_weights,
+ smpl_pose_targets, smpl_pose_weights, smpl_shape_targets,
+ smpl_shape_weights, vert_targets, vert_weights, has_smpl_,
+ has_keypoints2d_, has_keypoints3d_, pos_inds, neg_inds)
+
+ def forward_test(self, img, img_metas, **kwargs):
+ batch_input_shape = tuple(img[0].size()[-2:])
+ for img_meta in img_metas:
+ img_meta['batch_input_shape'] = batch_input_shape
+ features = self.backbone(img)
+ if self.neck is not None:
+ features = self.neck(features)
+ pred_pose, pred_betas, pred_cam, _, _ = \
+ self.head(features, img_metas)
+
+ # pred_pose = pred_pose[-1]
+ # pred_betas = pred_betas[-1]
+ # pred_cam = pred_cam[-1]
+
+ L, B, N = pred_pose.shape[:3]
+ if self.body_model_test is not None:
+ pred_output = self.body_model_test(
+ betas=pred_betas.reshape(L * B * N, 10),
+ body_pose=pred_pose.reshape(L * B * N, 24, 3, 3)[:, 1:],
+ global_orient=pred_pose.reshape(L * B * N, 24, 3,
+ 3)[:, 0].unsqueeze(1),
+ pose2rot=False)
+ else:
+            raise ValueError('Please provide a built body model.')
+
+ pred_keypoints_3d = pred_output['joints'].reshape(L, B, N, -1, 3)
+ pred_keypoints_3d = (pred_keypoints_3d -
+ pred_keypoints_3d[..., [0], :])
+ pred_keypoints_3d = pred_keypoints_3d.detach().cpu().numpy()
+ # pred_vertices = pred_output['vertices'].reshape(L, B, N, 6890, 3)
+ pred_cam = pred_cam.detach().cpu().numpy()
+ pred_pose = pred_pose.detach().cpu().numpy()
+ pred_betas = pred_betas.detach().cpu().numpy()
+ # batch, instance_num, kp_num, 4
+ gt_keypoints3d = kwargs['keypoints3d'].repeat([1, N, 1, 1]).clone()
+ # keypoints3d_mask = kwargs['keypoints3d_mask']
+ gt_keypoints3d = gt_keypoints3d.detach().cpu().numpy()
+ # gt_keypoints3d, _ = convert_kps(
+ # gt_keypoints3d,
+ # src='human_data',
+ # dst='h36m')
+
+ cost = np.sum((pred_keypoints_3d[-1] - gt_keypoints3d[..., :3]),
+ axis=(2, 3))
+ index = np.argmin(abs(cost), -1)
+
+ pred_keypoints_3d_ = []
+ pred_pose_ = []
+ pred_betas_ = []
+ pred_cam_ = []
+
+ for batch_i in range(B):
+ ind = index[batch_i]
+ pred_keypoints_3d_.append(pred_keypoints_3d[-1, batch_i, ind])
+ pred_pose_.append(pred_pose[-1, batch_i, ind])
+ pred_betas_.append(pred_betas[-1, batch_i, ind])
+ pred_cam_.append(pred_cam[-1, batch_i, ind])
+
+ # for img_id in range(len(img_metas)):
+ # pred_pose_ = pred_pose[:, img_id]
+ # pred_betas_ = pred_betas[:, img_id]
+ # pred_cam_ = pred_cam[:, img_id]
+ # pred_keypoints_3d_ = pred_keypoints_3d[:, img_id]
+ # pred_vertices_ = pred_vertices[:, img_id]
+ # img_shape_ = img_metas[img_id]['img_shape']
+
+ # result_list.append()
+
+ all_preds = {}
+ all_preds['keypoints_3d'] = np.array(pred_keypoints_3d_)
+ all_preds['smpl_pose'] = np.array(pred_pose_)
+ all_preds['smpl_beta'] = np.array(pred_betas_)
+ all_preds['camera'] = np.array(pred_cam_)
+ # all_preds['vertices'] = pred_vertices.detach().cpu().numpy()
+
+ image_path = []
+ for img_meta in img_metas:
+ image_path.append(img_meta['image_path'])
+ all_preds['image_path'] = image_path
+ all_preds['image_idx'] = kwargs['sample_idx']
+ return all_preds
+ # loss
+
+ def compute_keypoints3d_loss(
+ self,
+ pred_keypoints3d: torch.Tensor,
+ gt_keypoints3d: torch.Tensor,
+ has_keypoints3d: Optional[torch.Tensor] = None):
+ """Compute loss for 3d keypoints."""
+ keypoints3d_conf = gt_keypoints3d[:, :, 3].float().unsqueeze(-1)
+ keypoints3d_conf = keypoints3d_conf.repeat(1, 1, 3)
+ pred_keypoints3d = pred_keypoints3d.float()
+ gt_keypoints3d = gt_keypoints3d[:, :, :3].float()
+
+ # currently, only mpi_inf_3dhp and h36m have 3d keypoints
+ # both datasets have right_hip_extra and left_hip_extra
+ right_hip_idx = get_keypoint_idx('right_hip_extra', self.convention)
+ left_hip_idx = get_keypoint_idx('left_hip_extra', self.convention)
+ gt_pelvis = (gt_keypoints3d[:, right_hip_idx, :] +
+ gt_keypoints3d[:, left_hip_idx, :]) / 2
+ pred_pelvis = (pred_keypoints3d[:, right_hip_idx, :] +
+ pred_keypoints3d[:, left_hip_idx, :]) / 2
+
+ gt_keypoints3d = gt_keypoints3d - gt_pelvis[:, None, :]
+ pred_keypoints3d = pred_keypoints3d - pred_pelvis[:, None, :]
+ loss = self.loss_keypoints3d(pred_keypoints3d,
+ gt_keypoints3d,
+ reduction_override='none')
+
+ # If has_keypoints3d is not None, then computes the losses on the
+ # instances that have ground-truth keypoints3d.
+ # But the zero confidence keypoints will be included in mean.
+ # Otherwise, only compute the keypoints3d
+ # which have positive confidence.
+
+ # has_keypoints3d is None when the key has_keypoints3d
+ # is not in the datasets
+ if has_keypoints3d is None:
+
+ valid_pos = keypoints3d_conf > 0
+ if keypoints3d_conf[valid_pos].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints3d)
+ loss = torch.sum(loss * keypoints3d_conf)
+ loss /= keypoints3d_conf[valid_pos].numel()
+ else:
+
+ keypoints3d_conf = keypoints3d_conf[has_keypoints3d == 1]
+ if keypoints3d_conf.shape[0] == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints3d)
+ loss = loss[has_keypoints3d == 1]
+ loss = (loss * keypoints3d_conf).mean()
+ return loss
+
+ def compute_keypoints2d_loss(
+ self,
+ pred_keypoints3d: torch.Tensor,
+ pred_cam: torch.Tensor,
+ gt_keypoints2d: torch.Tensor,
+ img_res: Optional[int] = 512,
+ focal_length: Optional[int] = 5000.,
+ has_keypoints2d: Optional[torch.Tensor] = None):
+ """Compute loss for 2d keypoints."""
+ keypoints2d_conf = gt_keypoints2d[:, :, 2].float().unsqueeze(-1)
+ keypoints2d_conf = keypoints2d_conf.repeat(1, 1, 2)
+ gt_keypoints2d = gt_keypoints2d[:, :, :2].float()
+ pred_keypoints2d = project_points(pred_keypoints3d,
+ pred_cam,
+ focal_length=focal_length,
+ img_res=img_res)
+ # Normalize keypoints to [-1,1]
+ # The coordinate origin of pred_keypoints_2d is
+ # the center of the input image.
+ pred_keypoints2d = 2 * pred_keypoints2d / (img_res - 1)
+ # The coordinate origin of gt_keypoints_2d is
+ # the top left corner of the input image.
+ gt_keypoints2d = 2 * gt_keypoints2d / (img_res - 1) - 1
+ loss = self.loss_keypoints2d(pred_keypoints2d,
+ gt_keypoints2d,
+ reduction_override='none')
+
+ # If has_keypoints2d is not None, then computes the losses on the
+ # instances that have ground-truth keypoints2d.
+ # But the zero confidence keypoints will be included in mean.
+ # Otherwise, only compute the keypoints2d
+ # which have positive confidence.
+ # has_keypoints2d is None when the key has_keypoints2d
+ # is not in the datasets
+
+ if has_keypoints2d is None:
+ valid_pos = keypoints2d_conf > 0
+ if keypoints2d_conf[valid_pos].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints2d)
+ loss = torch.sum(loss * keypoints2d_conf)
+ loss /= keypoints2d_conf[valid_pos].numel()
+ else:
+ keypoints2d_conf = keypoints2d_conf[has_keypoints2d == 1]
+ if keypoints2d_conf.shape[0] == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints2d)
+ loss = loss[has_keypoints2d == 1]
+ loss = (loss * keypoints2d_conf).mean()
+
+ return loss
+
+ def compute_vertex_loss(self, pred_vertices: torch.Tensor,
+ gt_vertices: torch.Tensor, has_smpl: torch.Tensor):
+ """Compute loss for vertices."""
+ gt_vertices = gt_vertices.float()
+ conf = has_smpl.float().view(-1, 1, 1)
+ conf = conf.repeat(1, gt_vertices.shape[1], gt_vertices.shape[2])
+ loss = self.loss_vertex(pred_vertices,
+ gt_vertices,
+ reduction_override='none')
+ valid_pos = conf > 0
+ if conf[valid_pos].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_vertices)
+ loss = torch.sum(loss * conf) / conf[valid_pos].numel()
+ return loss
+
+ def compute_smpl_pose_loss(self, pred_pose: torch.Tensor,
+ gt_pose: torch.Tensor, has_smpl: torch.Tensor):
+ """Compute loss for smpl pose."""
+ conf = has_smpl.float().view(-1)
+ valid_pos = conf > 0
+ if conf[valid_pos].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_pose)
+ pred_pose = pred_pose[valid_pos]
+ gt_pose = gt_pose[valid_pos]
+ conf = conf[valid_pos]
+ # gt_rotmat = batch_rodrigues(gt_pose.view(-1, 3)).view(-1, 24, 3, 3)
+ loss = self.loss_smpl_pose(pred_pose,
+ gt_pose,
+ reduction_override='none')
+ loss = loss.view(loss.shape[0], -1).mean(-1)
+ loss = torch.mean(loss * conf)
+ return loss
+
+ def compute_smpl_betas_loss(self, pred_betas: torch.Tensor,
+ gt_betas: torch.Tensor,
+ has_smpl: torch.Tensor):
+ """Compute loss for smpl betas."""
+ conf = has_smpl.float().view(-1)
+ valid_pos = conf > 0
+ if conf[valid_pos].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_betas)
+ pred_betas = pred_betas[valid_pos]
+ gt_betas = gt_betas[valid_pos]
+ conf = conf[valid_pos]
+ loss = self.loss_smpl_betas(pred_betas,
+ gt_betas,
+ reduction_override='none')
+ loss = loss.view(loss.shape[0], -1).mean(-1)
+ loss = torch.mean(loss * conf)
+ return loss
+
+ def compute_camera_loss(self, cameras: torch.Tensor):
+ """Compute loss for predicted camera parameters."""
+ loss = self.loss_camera(cameras)
+ return loss
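One subtle point in `compute_keypoints2d_loss` above: predictions and ground truth use different pixel conventions. `project_points` returns coordinates relative to the image center, while annotated keypoints use a top-left origin, so only the latter gets the extra `- 1` during normalization. A small numeric check of that convention (values are purely illustrative):

```python
import torch

img_res = 512
pred_center_origin = torch.tensor([[0.0, 0.0]])      # center-origin frame
gt_top_left_origin = torch.tensor([[255.5, 255.5]])  # top-left pixel frame

pred_norm = 2 * pred_center_origin / (img_res - 1)
gt_norm = 2 * gt_top_left_origin / (img_res - 1) - 1
print(pred_norm, gt_norm)  # the same image-center point maps to (0, 0) in both
```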
diff --git a/detrsmpl/models/architectures/__init__.py b/detrsmpl/models/architectures/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/models/architectures/base_architecture.py b/detrsmpl/models/architectures/base_architecture.py
new file mode 100644
index 0000000000000000000000000000000000000000..09c6e5130cdcef666b4407a5ae891370d877601a
--- /dev/null
+++ b/detrsmpl/models/architectures/base_architecture.py
@@ -0,0 +1,108 @@
+from abc import ABCMeta, abstractmethod
+from collections import OrderedDict
+
+import torch
+import torch.distributed as dist
+from mmcv.runner import BaseModule
+
+
+class BaseArchitecture(BaseModule, metaclass=ABCMeta):
+ """Base class for mmhuman3d architecture."""
+ def __init__(self, init_cfg=None):
+ super(BaseArchitecture, self).__init__(init_cfg)
+
+ @abstractmethod
+ def forward_train(self, **kwargs):
+ pass
+
+ @abstractmethod
+ def forward_test(self, **kwargs):
+ pass
+
+ def _parse_losses(self, losses):
+ """Parse the raw outputs (losses) of the network.
+
+ Args:
+ losses (dict): Raw output of the network, which usually contain
+ losses and other necessary information.
+ Returns:
+ tuple[Tensor, dict]: (loss, log_vars), loss is the loss tensor \
+ which may be a weighted sum of all losses, log_vars contains \
+ all the variables to be sent to the logger.
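+
+            Example (an illustrative sketch; the loss names below are
+            hypothetical)::
+
+                >>> losses = {'keypoints3d_loss': torch.tensor(0.6),
+                ...           'smpl_pose_loss': torch.tensor(0.4)}
+                >>> loss, log_vars = self._parse_losses(losses)
+                >>> # loss sums every entry whose key contains 'loss' (here 1.0)
+                >>> # and log_vars additionally stores that total under 'loss'.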
+ """
+ log_vars = OrderedDict()
+ for loss_name, loss_value in losses.items():
+ if isinstance(loss_value, torch.Tensor):
+ log_vars[loss_name] = loss_value.mean()
+ elif isinstance(loss_value, list):
+ log_vars[loss_name] = sum(_loss.mean() for _loss in loss_value)
+ else:
+ raise TypeError(
+ f'{loss_name} is not a tensor or list of tensors')
+
+ loss = sum(_value for _key, _value in log_vars.items()
+ if 'loss' in _key)
+
+ log_vars['loss'] = loss
+ for loss_name, loss_value in log_vars.items():
+ # reduce loss when distributed training
+ if dist.is_available() and dist.is_initialized():
+ loss_value = loss_value.data.clone()
+ dist.all_reduce(loss_value.div_(dist.get_world_size()))
+ log_vars[loss_name] = loss_value.item()
+
+ return loss, log_vars
+
+ def train_step(self, data, optimizer):
+ """The iteration step during training.
+ This method defines an iteration step during training, except for the
+ back propagation and optimizer updating, which are done in an optimizer
+ hook. Note that in some complicated cases or models, the whole process
+ including back propagation and optimizer updating is also defined in
+ this method, such as GAN.
+ Args:
+ data (dict): The output of dataloader.
+ optimizer (:obj:`torch.optim.Optimizer` | dict): The optimizer of
+ runner is passed to ``train_step()``. This argument is unused
+ and reserved.
+ Returns:
+ dict: It should contain at least 3 keys: ``loss``, ``log_vars``, \
+ ``num_samples``.
+ - ``loss`` is a tensor for back propagation, which can be a
+ weighted sum of multiple losses.
+ - ``log_vars`` contains all the variables to be sent to the
+ logger.
+ - ``num_samples`` indicates the batch size (when the model is
+ DDP, it means the batch size on each GPU), which is used for
+ averaging the logs.
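+
+            A minimal sketch of the expected structure (all values are
+            placeholders)::
+
+                dict(loss=torch.tensor(1.0),
+                     log_vars={'keypoints3d_loss': 0.6, 'loss': 1.0},
+                     num_samples=32)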
+ """
+ losses = self(**data)
+ loss, log_vars = self._parse_losses(losses)
+
+ outputs = dict(loss=loss,
+ log_vars=log_vars,
+ num_samples=len(data['img_metas']))
+
+ return outputs
+
+ def val_step(self, data, optimizer=None):
+ """The iteration step during validation.
+
+ This method shares the same signature as :func:`train_step`, but used
+ during val epochs. Note that the evaluation after training epochs is
+ not implemented with this method, but an evaluation hook.
+ """
+ losses = self(**data)
+ loss, log_vars = self._parse_losses(losses)
+
+ outputs = dict(loss=loss,
+ log_vars=log_vars,
+ num_samples=len(data['img_metas']))
+
+ return outputs
+
+ def forward(self, **kwargs):
+ if self.training:
+ return self.forward_train(**kwargs)
+ else:
+ return self.forward_test(**kwargs)
diff --git a/detrsmpl/models/architectures/builder.py b/detrsmpl/models/architectures/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cd876521feb1c96a5a6fa07a7d5211d59574eed
--- /dev/null
+++ b/detrsmpl/models/architectures/builder.py
@@ -0,0 +1,37 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmcv.cnn import MODELS as MMCV_MODELS
+from mmcv.utils import Registry
+
+from .DetrSMPL import MultiBodyEstimator
+from .expressive_mesh_estimator import SMPLXImageBodyModelEstimator
+from .hybrik import HybrIK_trainer
+from .mesh_estimator import ImageBodyModelEstimator, VideoBodyModelEstimator
+from .DetrSMPLloss import DETRLoss
+
+
+def build_from_cfg(cfg, registry, default_args=None):
+ if cfg is None:
+ return None
+ return MMCV_MODELS.build_func(cfg, registry, default_args)
+
+
+ARCHITECTURES = Registry('architectures',
+ parent=MMCV_MODELS,
+ build_func=build_from_cfg)
+
+ARCHITECTURES.register_module(name='HybrIK_trainer', module=HybrIK_trainer)
+ARCHITECTURES.register_module(name='ImageBodyModelEstimator',
+ module=ImageBodyModelEstimator)
+ARCHITECTURES.register_module(name='VideoBodyModelEstimator',
+ module=VideoBodyModelEstimator)
+ARCHITECTURES.register_module(name='SMPLXImageBodyModelEstimator',
+ module=SMPLXImageBodyModelEstimator)
+ARCHITECTURES.register_module(name='MultiBodyEstimator',
+ module=MultiBodyEstimator)
+ARCHITECTURES.register_module(name='DETRLoss', module=DETRLoss)
+
+
+def build_architecture(cfg):
+ """Build framework."""
+ return ARCHITECTURES.build(cfg)
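+
+
+# Example usage (an illustrative sketch; the backbone config is a placeholder,
+# not a guaranteed module name):
+#   cfg = dict(type='MultiBodyEstimator', backbone=dict(type='ResNet', depth=50))
+#   model = build_architecture(cfg)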
diff --git a/detrsmpl/models/architectures/expressive_mesh_estimator.py b/detrsmpl/models/architectures/expressive_mesh_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a45a0c06ae26db0336ff4c122d936ed5374f365
--- /dev/null
+++ b/detrsmpl/models/architectures/expressive_mesh_estimator.py
@@ -0,0 +1,848 @@
+from abc import ABCMeta, abstractmethod
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from detrsmpl.core.conventions.keypoints_mapping import (
+ get_keypoint_idx,
+ get_keypoint_idxs_by_part,
+)
+from detrsmpl.utils.geometry import (
+ batch_rodrigues,
+ weak_perspective_projection,
+)
+from ..backbones.builder import build_backbone
+from ..body_models.builder import build_body_model
+from ..heads.builder import build_head
+from ..losses.builder import build_loss
+from ..necks.builder import build_neck
+from ..utils import (
+ SMPLXFaceCropFunc,
+ SMPLXFaceMergeFunc,
+ SMPLXHandCropFunc,
+ SMPLXHandMergeFunc,
+)
+from .base_architecture import BaseArchitecture
+
+
+def set_requires_grad(nets, requires_grad=False):
+ """Set requies_grad for all the networks.
+
+ Args:
+ nets (nn.Module | list[nn.Module]): A list of networks or a single
+ network.
+ requires_grad (bool): Whether the networks require gradients or not
+ """
+ if not isinstance(nets, list):
+ nets = [nets]
+ for net in nets:
+ if net is not None:
+ for param in net.parameters():
+ param.requires_grad = requires_grad
+
+
+def pose2rotmat(pred_pose):
+ """aa2rotmat."""
+ if len(pred_pose.shape) == 3:
+ num_joints = pred_pose.shape[1]
+ pred_pose = batch_rodrigues(pred_pose.view(-1, 3)).view(
+ -1, num_joints, 3, 3)
+ return pred_pose
+
+
+class SMPLXBodyModelEstimator(BaseArchitecture, metaclass=ABCMeta):
+ """BodyModelEstimator Architecture.
+
+ Args:
+ backbone (dict | None, optional): Backbone config dict. Default: None.
+ neck (dict | None, optional): Neck config dict. Default: None
+ head (dict | None, optional): Regressor config dict. Default: None.
+ body_model_train (dict | None, optional): SMPL config dict during
+ training. Default: None.
+ body_model_test (dict | None, optional): SMPL config dict during
+ test. Default: None.
+ convention (str, optional): Keypoints convention. Default: "human_data"
+ loss_keypoints2d (dict | None, optional): Losses config dict for
+ 2D keypoints. Default: None.
+ loss_keypoints3d (dict | None, optional): Losses config dict for
+ 3D keypoints. Default: None.
+ loss_smplx_global_orient (dict | None, optional): Losses config dict
+ for smplx global orient. Default: None
+ loss_smplx_body_pose (dict | None, optional): Losses config dict
+ for smplx body pose. Default: None
+ loss_smplx_hand_pose (dict | None, optional): Losses config dict
+ for smplx hand pose. Default: None
+ loss_smplx_jaw_pose (dict | None, optional): Losses config dict
+ for smplx jaw pose. Default: None
+ loss_smplx_expression (dict | None, optional): Losses config dict
+ for smplx expression. Default: None
+ loss_smplx_betas (dict | None, optional): Losses config dict for smplx
+ betas. Default: None
+ loss_camera (dict | None, optional): Losses config dict for predicted
+ camera parameters. Default: None
+ extra_hand_model_cfg (dict | None, optional) : Hand model config for
+ refining body model prediction. Default: None
+ extra_face_model_cfg (dict | None, optional) : Face model config for
+ refining body model prediction. Default: None
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Default: None.
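+
+    A hypothetical config sketch (the backbone/head ``type`` names are
+    placeholders, not guaranteed to ship with this repo)::
+
+        model = dict(
+            type='SMPLXImageBodyModelEstimator',
+            backbone=dict(type='SomeBackbone'),
+            head=dict(type='SomeSMPLXHead'),
+            body_model_train=dict(type='SMPLX'),
+            convention='human_data')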
+ """
+ def __init__(self,
+ backbone: Optional[Union[dict, None]] = None,
+ neck: Optional[Union[dict, None]] = None,
+ head: Optional[Union[dict, None]] = None,
+ body_model_train: Optional[Union[dict, None]] = None,
+ body_model_test: Optional[Union[dict, None]] = None,
+ convention: Optional[str] = 'human_data',
+ loss_keypoints2d: Optional[Union[dict, None]] = None,
+ loss_keypoints3d: Optional[Union[dict, None]] = None,
+ loss_smplx_global_orient: Optional[Union[dict, None]] = None,
+ loss_smplx_body_pose: Optional[Union[dict, None]] = None,
+ loss_smplx_hand_pose: Optional[Union[dict, None]] = None,
+ loss_smplx_jaw_pose: Optional[Union[dict, None]] = None,
+ loss_smplx_expression: Optional[Union[dict, None]] = None,
+ loss_smplx_betas: Optional[Union[dict, None]] = None,
+ loss_smplx_betas_prior: Optional[Union[dict, None]] = None,
+ loss_camera: Optional[Union[dict, None]] = None,
+ extra_hand_model_cfg: Optional[Union[dict, None]] = None,
+ extra_face_model_cfg: Optional[Union[dict, None]] = None,
+ frozen_batchnorm: bool = False,
+ init_cfg: Optional[Union[list, dict, None]] = None):
+ super(SMPLXBodyModelEstimator, self).__init__(init_cfg)
+ self.backbone = build_backbone(backbone)
+ self.neck = build_neck(neck)
+ self.head = build_head(head)
+
+ if frozen_batchnorm:
+ for param in self.backbone.parameters():
+ param.requires_grad = False
+ for param in self.head.parameters():
+ param.requires_grad = False
+
+ self.backbone = FrozenBatchNorm2d.convert_frozen_batchnorm(
+ self.backbone)
+ self.head = FrozenBatchNorm2d.convert_frozen_batchnorm(self.head)
+
+ self.body_model_train = build_body_model(body_model_train)
+ self.body_model_test = build_body_model(body_model_test)
+ self.convention = convention
+
+ self.apply_hand_model = False
+ self.apply_face_model = False
+ if extra_hand_model_cfg is not None:
+ self.hand_backbone = build_backbone(
+ extra_hand_model_cfg.get('backbone', None))
+ self.hand_neck = build_neck(extra_hand_model_cfg.get('neck', None))
+ self.hand_head = build_head(extra_hand_model_cfg.get('head', None))
+ crop_cfg = extra_hand_model_cfg.get('crop_cfg', None)
+ if crop_cfg is not None:
+ self.crop_hand_func = SMPLXHandCropFunc(
+ self.hand_head,
+ self.body_model_train,
+ convention=self.convention,
+ **crop_cfg)
+ self.hand_merge_func = SMPLXHandMergeFunc(
+ self.body_model_train, self.convention)
+ self.hand_crop_loss = build_loss(
+ extra_hand_model_cfg.get('loss_hand_crop', None))
+ self.apply_hand_model = True
+ self.left_hand_idxs = get_keypoint_idxs_by_part(
+ 'left_hand', self.convention)
+ self.left_hand_idxs.append(
+ get_keypoint_idx('left_wrist', self.convention))
+ self.left_hand_idxs = sorted(self.left_hand_idxs)
+ self.right_hand_idxs = get_keypoint_idxs_by_part(
+ 'right_hand', self.convention)
+ self.right_hand_idxs.append(
+ get_keypoint_idx('right_wrist', self.convention))
+ self.right_hand_idxs = sorted(self.right_hand_idxs)
+
+ if extra_face_model_cfg is not None:
+ self.face_backbone = build_backbone(
+ extra_face_model_cfg.get('backbone', None))
+ self.face_neck = build_neck(extra_face_model_cfg.get('neck', None))
+ self.face_head = build_head(extra_face_model_cfg.get('head', None))
+ crop_cfg = extra_face_model_cfg.get('crop_cfg', None)
+ if crop_cfg is not None:
+ self.crop_face_func = SMPLXFaceCropFunc(
+ self.face_head,
+ self.body_model_train,
+ convention=self.convention,
+ **crop_cfg)
+ self.face_merge_func = SMPLXFaceMergeFunc(
+ self.body_model_train, self.convention)
+ self.face_crop_loss = build_loss(
+ extra_face_model_cfg.get('loss_face_crop', None))
+ self.apply_face_model = True
+ self.face_idxs = get_keypoint_idxs_by_part('head', self.convention)
+ self.face_idxs = sorted(self.face_idxs)
+
+ self.loss_keypoints2d = build_loss(loss_keypoints2d)
+ self.loss_keypoints3d = build_loss(loss_keypoints3d)
+
+ self.loss_smplx_global_orient = build_loss(loss_smplx_global_orient)
+ self.loss_smplx_body_pose = build_loss(loss_smplx_body_pose)
+ self.loss_smplx_hand_pose = build_loss(loss_smplx_hand_pose)
+ self.loss_smplx_jaw_pose = build_loss(loss_smplx_jaw_pose)
+ self.loss_smplx_expression = build_loss(loss_smplx_expression)
+ self.loss_smplx_betas = build_loss(loss_smplx_betas)
+ self.loss_smplx_betas_piror = build_loss(loss_smplx_betas_prior)
+ self.loss_camera = build_loss(loss_camera)
+ set_requires_grad(self.body_model_train, False)
+ set_requires_grad(self.body_model_test, False)
+
+ def train_step(self, data_batch, optimizer, **kwargs):
+ """Train step function.
+
+ Args:
+ data_batch (torch.Tensor): Batch of data as input.
+ optimizer (dict[torch.optim.Optimizer]): Dict with optimizers for
+ generator.
+ Returns:
+ outputs (dict): Dict with loss, information for logger,
+ the number of samples.
+ """
+ if self.backbone is not None:
+ img = data_batch['img']
+ features = self.backbone(img)
+ else:
+ features = data_batch['features']
+
+ if self.neck is not None:
+ features = self.neck(features)
+
+ predictions = self.head(features)
+ if self.apply_hand_model:
+ hand_input_img, hand_mean, hand_crop_info = self.crop_hand_func(
+ predictions, data_batch['img_metas'])
+ hand_features = self.hand_backbone(hand_input_img)
+ if self.neck is not None:
+ hand_features = self.hand_neck(hand_features)
+ hand_predictions = self.hand_head(hand_features, cond=hand_mean)
+ predictions = self.hand_merge_func(predictions, hand_predictions)
+ predictions['hand_crop_info'] = hand_crop_info
+ if self.apply_face_model:
+ face_input_img, face_mean, face_crop_info = self.crop_face_func(
+ predictions, data_batch['img_metas'])
+ face_features = self.face_backbone(face_input_img)
+ if self.neck is not None:
+ face_features = self.face_neck(face_features)
+ face_predictions = self.face_head(face_features, cond=face_mean)
+ predictions = self.face_merge_func(predictions, face_predictions)
+ predictions['face_crop_info'] = face_crop_info
+
+ targets = self.prepare_targets(data_batch)
+
+ losses = self.compute_losses(predictions, targets)
+
+ loss, log_vars = self._parse_losses(losses)
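+        # Every sub-module (backbone/neck/head plus the optional hand and face
+        # branches) has its own optimizer; zero all of them before backward.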
+ if self.backbone is not None:
+ optimizer['backbone'].zero_grad()
+ if self.neck is not None:
+ optimizer['neck'].zero_grad()
+ if self.head is not None:
+ optimizer['head'].zero_grad()
+
+ if self.apply_hand_model:
+ if self.hand_backbone is not None:
+ optimizer['hand_backbone'].zero_grad()
+ if self.hand_neck is not None:
+ optimizer['hand_neck'].zero_grad()
+ if self.hand_head is not None:
+ optimizer['hand_head'].zero_grad()
+
+ if self.apply_face_model:
+ if self.face_backbone is not None:
+ optimizer['face_backbone'].zero_grad()
+ if self.face_neck is not None:
+ optimizer['face_neck'].zero_grad()
+ if self.face_head is not None:
+ optimizer['face_head'].zero_grad()
+
+ loss.backward()
+ if self.backbone is not None:
+ optimizer['backbone'].step()
+ if self.neck is not None:
+ optimizer['neck'].step()
+ if self.head is not None:
+ optimizer['head'].step()
+
+ if self.apply_hand_model:
+ if self.hand_backbone is not None:
+ optimizer['hand_backbone'].step()
+ if self.hand_neck is not None:
+ optimizer['hand_neck'].step()
+ if self.hand_head is not None:
+ optimizer['hand_head'].step()
+
+ if self.apply_face_model:
+ if self.face_backbone is not None:
+ optimizer['face_backbone'].step()
+ if self.face_neck is not None:
+ optimizer['face_neck'].step()
+ if self.face_head is not None:
+ optimizer['face_head'].step()
+
+ outputs = dict(loss=loss,
+ log_vars=log_vars,
+ num_samples=len(next(iter(data_batch.values()))))
+ return outputs
+
+ def compute_keypoints3d_loss(
+ self,
+ pred_keypoints3d: torch.Tensor,
+ gt_keypoints3d: torch.Tensor,
+ has_keypoints3d: Optional[torch.Tensor] = None):
+ """Compute loss for 3d keypoints."""
+ keypoints3d_conf = gt_keypoints3d[:, :, 3].float().unsqueeze(-1)
+ keypoints3d_conf = keypoints3d_conf.repeat(1, 1, 3)
+ pred_keypoints3d = pred_keypoints3d.float()
+ gt_keypoints3d = gt_keypoints3d[:, :, :3].float()
+
+ if has_keypoints3d is None:
+ has_keypoints3d = torch.ones((keypoints3d_conf.shape[0]))
+ if keypoints3d_conf[has_keypoints3d == 1].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints3d)
+ # Center the predictions using the pelvis
+ target_idxs = has_keypoints3d == 1
+ pred_keypoints3d = pred_keypoints3d[target_idxs]
+ gt_keypoints3d = gt_keypoints3d[target_idxs]
+ pred_pelvis = pred_keypoints3d[:, [1, 2], :].mean(dim=1, keepdim=True)
+ pred_keypoints3d = pred_keypoints3d - pred_pelvis
+ gt_pelvis = gt_keypoints3d[:, [1, 2], :].mean(dim=1, keepdim=True)
+ gt_keypoints3d = gt_keypoints3d - gt_pelvis
+
+ loss = self.loss_keypoints3d(pred_keypoints3d,
+ gt_keypoints3d,
+ weight=keypoints3d_conf[target_idxs])
+ loss /= gt_keypoints3d.shape[0]
+ return loss
+
+ def compute_keypoints2d_loss(
+ self,
+ pred_keypoints3d: torch.Tensor,
+ pred_cam: torch.Tensor,
+ gt_keypoints2d: torch.Tensor,
+ img_res: Optional[int] = 224,
+ focal_length: Optional[int] = 5000,
+ has_keypoints2d: Optional[torch.Tensor] = None):
+ """Compute loss for 2d keypoints."""
+ keypoints2d_conf = gt_keypoints2d[:, :, 2].float().unsqueeze(-1)
+ keypoints2d_conf = keypoints2d_conf.repeat(1, 1, 2)
+ gt_keypoints2d = gt_keypoints2d[:, :, :2].float()
+ if has_keypoints2d is None:
+ has_keypoints2d = torch.ones((keypoints2d_conf.shape[0]))
+ if keypoints2d_conf[has_keypoints2d == 1].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints2d)
+
+        # ExPose uses weak_perspective_projection
+ pred_keypoints2d = weak_perspective_projection(
+ pred_keypoints3d,
+ scale=pred_cam[:, 0],
+ translation=pred_cam[:, 1:3])
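+        # Map ground-truth pixel coordinates into the [-1, 1] range so they
+        # are comparable with the normalized projection above.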
+ gt_keypoints2d = 2 * gt_keypoints2d / (img_res - 1) - 1
+
+ target_idxs = has_keypoints2d == 1
+ pred_keypoints2d = pred_keypoints2d[target_idxs]
+ gt_keypoints2d = gt_keypoints2d[target_idxs]
+ loss = self.loss_keypoints2d(pred_keypoints2d,
+ gt_keypoints2d,
+ weight=keypoints2d_conf[target_idxs])
+ loss /= gt_keypoints2d.shape[0]
+ return loss
+
+ def compute_smplx_body_pose_loss(self, pred_rotmat: torch.Tensor,
+ gt_pose: torch.Tensor,
+ has_smplx_body_pose: torch.Tensor):
+ """Compute loss for smplx body pose."""
+ num_joints = pred_rotmat.shape[1]
+ target_idxs = has_smplx_body_pose == 1
+ if gt_pose[target_idxs].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_pose)
+
+ gt_rotmat = batch_rodrigues(gt_pose.view(-1, 3)).view(
+ -1, num_joints, 3, 3)
+
+ loss = self.loss_smplx_body_pose(pred_rotmat[target_idxs],
+ gt_rotmat[target_idxs])
+ return loss
+
+ def compute_smplx_global_orient_loss(
+ self, pred_rotmat: torch.Tensor, gt_global_orient: torch.Tensor,
+ has_smplx_global_orient: torch.Tensor):
+ """Compute loss for smplx global orient."""
+ target_idxs = has_smplx_global_orient == 1
+ if gt_global_orient[target_idxs].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_global_orient)
+
+ gt_rotmat = batch_rodrigues(gt_global_orient.view(-1, 3)).view(
+ -1, 1, 3, 3)
+
+ loss = self.loss_smplx_global_orient(pred_rotmat[target_idxs],
+ gt_rotmat[target_idxs])
+ return loss
+
+ def compute_smplx_jaw_pose_loss(self, pred_rotmat: torch.Tensor,
+ gt_jaw_pose: torch.Tensor,
+ has_smplx_jaw_pose: torch.Tensor,
+ face_conf: torch.Tensor):
+ """Compute loss for smplx jaw pose."""
+ target_idxs = has_smplx_jaw_pose == 1
+ if gt_jaw_pose[target_idxs].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_jaw_pose)
+
+ gt_rotmat = batch_rodrigues(gt_jaw_pose.view(-1, 3)).view(-1, 1, 3, 3)
+ conf = face_conf.mean(axis=1).float()
+ conf = conf.view(-1, 1, 1, 1)
+
+ loss = self.loss_smplx_jaw_pose(pred_rotmat[target_idxs],
+ gt_rotmat[target_idxs],
+ weight=conf[target_idxs])
+ return loss
+
+ def compute_smplx_hand_pose_loss(self, pred_rotmat: torch.Tensor,
+ gt_hand_pose: torch.Tensor,
+ has_smplx_hand_pose: torch.Tensor,
+ hand_conf: torch.Tensor):
+ """Compute loss for smplx left/right hand pose."""
+ joint_num = pred_rotmat.shape[1]
+ target_idxs = has_smplx_hand_pose == 1
+ if gt_hand_pose[target_idxs].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_hand_pose)
+ gt_rotmat = batch_rodrigues(gt_hand_pose.view(-1, 3)).view(
+ -1, joint_num, 3, 3)
+ conf = hand_conf.mean(axis=1,
+ keepdim=True).float().expand(-1, joint_num)
+ conf = conf.view(-1, joint_num, 1, 1)
+
+ loss = self.loss_smplx_hand_pose(pred_rotmat[target_idxs],
+ gt_rotmat[target_idxs],
+ weight=conf[target_idxs])
+ return loss
+
+ def compute_smplx_betas_loss(self, pred_betas: torch.Tensor,
+ gt_betas: torch.Tensor,
+ has_smplx_betas: torch.Tensor):
+ """Compute loss for smplx betas."""
+ target_idxs = has_smplx_betas == 1
+ if gt_betas[target_idxs].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_betas)
+
+ loss = self.loss_smplx_betas(pred_betas[target_idxs],
+ gt_betas[target_idxs])
+ loss = loss / gt_betas[target_idxs].shape[0]
+ return loss
+
+ def compute_smplx_betas_prior_loss(self, pred_betas: torch.Tensor):
+ """Compute prior loss for smplx betas."""
+ loss = self.loss_smplx_betas_piror(pred_betas)
+ return loss
+
+ def compute_smplx_expression_loss(self, pred_expression: torch.Tensor,
+ gt_expression: torch.Tensor,
+ has_smplx_expression: torch.Tensor,
+ face_conf: torch.Tensor):
+ """Compute loss for smplx betas."""
+ target_idxs = has_smplx_expression == 1
+ if gt_expression[target_idxs].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_expression)
+ conf = face_conf.mean(axis=1).float()
+ conf = conf.view(-1, 1)
+
+ loss = self.loss_smplx_expression(pred_expression[target_idxs],
+ gt_expression[target_idxs],
+ weight=conf[target_idxs])
+ loss = loss / gt_expression[target_idxs].shape[0]
+ return loss
+
+ def compute_camera_loss(self, cameras: torch.Tensor):
+ """Compute loss for predicted camera parameters."""
+ loss = self.loss_camera(cameras)
+ return loss
+
+ def compute_face_crop_loss(self,
+ pred_keypoints3d: torch.Tensor,
+ pred_cam: torch.Tensor,
+ gt_keypoints2d: torch.Tensor,
+ face_crop_info: dict,
+ img_res: Optional[int] = 224,
+ has_keypoints2d: Optional[torch.Tensor] = None):
+ """Compute face crop loss for 2d keypoints."""
+ keypoints2d_conf = gt_keypoints2d[:, :, 2].float().unsqueeze(-1)
+ keypoints2d_conf = keypoints2d_conf.repeat(1, 1, 2)
+ gt_keypoints2d = gt_keypoints2d[:, :, :2].float()
+ if has_keypoints2d is None:
+ has_keypoints2d = torch.ones((keypoints2d_conf.shape[0]))
+ if keypoints2d_conf[has_keypoints2d == 1].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints2d)
+
+        # ExPose uses weak_perspective_projection
+ pred_keypoints2d = weak_perspective_projection(
+ pred_keypoints3d,
+ scale=pred_cam[:, 0],
+ translation=pred_cam[:, 1:3])
+ target_idxs = has_keypoints2d == 1
+ pred_keypoints2d = pred_keypoints2d[target_idxs]
+ gt_keypoints2d = gt_keypoints2d[target_idxs]
+
+ pred_keypoints2d = (0.5 * pred_keypoints2d + 0.5) * (img_res - 1)
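+        # Apply the inverse crop transform (a per-sample 2x3 affine) to map
+        # crop-space pixel coordinates back to the original full-resolution image.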
+ face_inv_crop_transforms = face_crop_info['face_inv_crop_transforms']
+ pred_keypoints2d_hd = torch.einsum('bij,bkj->bki', [
+ face_inv_crop_transforms[:, :2, :2], pred_keypoints2d
+ ]) + face_inv_crop_transforms[:, :2, 2].unsqueeze(dim=1)
+ gt_keypoints2d_hd = torch.einsum('bij,bkj->bki', [
+ face_inv_crop_transforms[:, :2, :2], gt_keypoints2d
+ ]) + face_inv_crop_transforms[:, :2, 2].unsqueeze(dim=1)
+
+ pred_face_keypoints_hd = pred_keypoints2d_hd[:, self.face_idxs]
+ face_crop_transform = face_crop_info['face_crop_transform']
+ inv_face_crop_transf = torch.inverse(face_crop_transform)
+ face_img_keypoints = torch.einsum('bij,bkj->bki', [
+ inv_face_crop_transf[:, :2, :2], pred_face_keypoints_hd
+ ]) + inv_face_crop_transf[:, :2, 2].unsqueeze(dim=1)
+ gt_face_keypoints_hd = gt_keypoints2d_hd[:, self.face_idxs]
+ gt_face_keypoints = torch.einsum('bij,bkj->bki', [
+ inv_face_crop_transf[:, :2, :2], gt_face_keypoints_hd
+ ]) + inv_face_crop_transf[:, :2, 2].unsqueeze(dim=1)
+
+ loss = self.face_crop_loss(
+ face_img_keypoints,
+ gt_face_keypoints,
+ weight=keypoints2d_conf[target_idxs][:, self.face_idxs])
+ loss /= gt_face_keypoints.shape[0]
+ return loss
+
+ def compute_hand_crop_loss(self,
+ pred_keypoints3d: torch.Tensor,
+ pred_cam: torch.Tensor,
+ gt_keypoints2d: torch.Tensor,
+ hand_crop_info: dict,
+ img_res: Optional[int] = 224,
+ has_keypoints2d: Optional[torch.Tensor] = None):
+ """Compute hand crop loss for 2d keypoints."""
+ keypoints2d_conf = gt_keypoints2d[:, :, 2].float().unsqueeze(-1)
+ keypoints2d_conf = keypoints2d_conf.repeat(1, 1, 2)
+ gt_keypoints2d = gt_keypoints2d[:, :, :2].float()
+ if has_keypoints2d is None:
+ has_keypoints2d = torch.ones((keypoints2d_conf.shape[0]))
+ if keypoints2d_conf[has_keypoints2d == 1].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints2d)
+
+        # ExPose uses weak_perspective_projection
+ pred_keypoints2d = weak_perspective_projection(
+ pred_keypoints3d,
+ scale=pred_cam[:, 0],
+ translation=pred_cam[:, 1:3])
+ target_idxs = has_keypoints2d == 1
+ pred_keypoints2d = pred_keypoints2d[target_idxs]
+ gt_keypoints2d = gt_keypoints2d[target_idxs]
+
+ pred_keypoints2d = (0.5 * pred_keypoints2d + 0.5) * (img_res - 1)
+ hand_inv_crop_transforms = hand_crop_info['hand_inv_crop_transforms']
+ pred_keypoints2d_hd = torch.einsum('bij,bkj->bki', [
+ hand_inv_crop_transforms[:, :2, :2], pred_keypoints2d
+ ]) + hand_inv_crop_transforms[:, :2, 2].unsqueeze(dim=1)
+ gt_keypoints2d_hd = torch.einsum('bij,bkj->bki', [
+ hand_inv_crop_transforms[:, :2, :2], gt_keypoints2d
+ ]) + hand_inv_crop_transforms[:, :2, 2].unsqueeze(dim=1)
+
+ pred_left_hand_keypoints_hd = pred_keypoints2d_hd[:,
+ self.left_hand_idxs]
+ left_hand_crop_transform = hand_crop_info['left_hand_crop_transform']
+ inv_left_hand_crop_transf = torch.inverse(left_hand_crop_transform)
+ left_hand_img_keypoints = torch.einsum('bij,bkj->bki', [
+ inv_left_hand_crop_transf[:, :2, :2], pred_left_hand_keypoints_hd
+ ]) + inv_left_hand_crop_transf[:, :2, 2].unsqueeze(dim=1)
+ gt_left_hand_keypoints_hd = gt_keypoints2d_hd[:, self.left_hand_idxs]
+ gt_left_hand_keypoints = torch.einsum('bij,bkj->bki', [
+ inv_left_hand_crop_transf[:, :2, :2], gt_left_hand_keypoints_hd
+ ]) + inv_left_hand_crop_transf[:, :2, 2].unsqueeze(dim=1)
+
+ pred_right_hand_keypoints_hd = pred_keypoints2d_hd[:, self.
+ right_hand_idxs]
+ right_hand_crop_transform = hand_crop_info['right_hand_crop_transform']
+ inv_right_hand_crop_transf = torch.inverse(right_hand_crop_transform)
+ right_hand_img_keypoints = torch.einsum('bij,bkj->bki', [
+ inv_right_hand_crop_transf[:, :2, :2], pred_right_hand_keypoints_hd
+ ]) + inv_right_hand_crop_transf[:, :2, 2].unsqueeze(dim=1)
+ gt_right_hand_keypoints_hd = gt_keypoints2d_hd[:, self.right_hand_idxs]
+ gt_right_hand_keypoints = torch.einsum('bij,bkj->bki', [
+ inv_right_hand_crop_transf[:, :2, :2], gt_right_hand_keypoints_hd
+ ]) + inv_right_hand_crop_transf[:, :2, 2].unsqueeze(dim=1)
+
+ left_loss = self.hand_crop_loss(
+ left_hand_img_keypoints,
+ gt_left_hand_keypoints,
+ weight=keypoints2d_conf[target_idxs][:, self.left_hand_idxs])
+ left_loss /= gt_left_hand_keypoints.shape[0]
+
+ right_loss = self.hand_crop_loss(
+ right_hand_img_keypoints,
+ gt_right_hand_keypoints,
+ weight=keypoints2d_conf[target_idxs][:, self.right_hand_idxs])
+ right_loss /= gt_right_hand_keypoints.shape[0]
+
+ return left_loss + right_loss
+
+ def compute_losses(self, predictions: dict, targets: dict):
+ """Compute losses."""
+ pred_param = predictions['pred_param']
+ pred_cam = predictions['pred_cam']
+ gt_keypoints3d = targets['keypoints3d']
+ gt_keypoints2d = targets['keypoints2d']
+
+ if self.body_model_train is not None:
+ pred_output = self.body_model_train(**pred_param)
+ pred_keypoints3d = pred_output['joints']
+ if 'has_keypoints3d' in targets:
+ has_keypoints3d = targets['has_keypoints3d'].squeeze(-1)
+ else:
+ has_keypoints3d = None
+ if 'has_keypoints2d' in targets:
+ has_keypoints2d = targets['has_keypoints2d'].squeeze(-1)
+ else:
+ has_keypoints2d = None
+
+ losses = {}
+ if self.loss_keypoints3d is not None:
+ losses['keypoints3d_loss'] = self.compute_keypoints3d_loss(
+ pred_keypoints3d,
+ gt_keypoints3d,
+ has_keypoints3d=has_keypoints3d)
+ if self.loss_keypoints2d is not None:
+ losses['keypoints2d_loss'] = self.compute_keypoints2d_loss(
+ pred_keypoints3d,
+ pred_cam,
+ gt_keypoints2d,
+ img_res=targets['img'].shape[-1],
+ has_keypoints2d=has_keypoints2d)
+ if self.loss_smplx_global_orient is not None:
+ pred_global_orient = pred_param['global_orient']
+ pred_global_orient = pose2rotmat(pred_global_orient)
+ gt_global_orient = targets['smplx_global_orient']
+ has_smplx_global_orient = targets[
+ 'has_smplx_global_orient'].squeeze(-1)
+ losses['smplx_global_orient_loss'] = \
+ self.compute_smplx_global_orient_loss(
+ pred_global_orient, gt_global_orient,
+ has_smplx_global_orient)
+ if self.loss_smplx_body_pose is not None:
+ pred_pose = pred_param['body_pose']
+ pred_pose = pose2rotmat(pred_pose)
+ gt_pose = targets['smplx_body_pose']
+ has_smplx_body_pose = targets['has_smplx_body_pose'].squeeze(-1)
+ losses['smplx_body_pose_loss'] = \
+ self.compute_smplx_body_pose_loss(
+ pred_pose, gt_pose, has_smplx_body_pose)
+ if self.loss_smplx_jaw_pose is not None:
+ pred_jaw_pose = pred_param['jaw_pose']
+ pred_jaw_pose = pose2rotmat(pred_jaw_pose)
+ gt_jaw_pose = targets['smplx_jaw_pose']
+ face_conf = get_keypoint_idxs_by_part('head', self.convention)
+ has_smplx_jaw_pose = targets['has_smplx_jaw_pose'].squeeze(-1)
+ losses['smplx_jaw_pose_loss'] = self.compute_smplx_jaw_pose_loss(
+ pred_jaw_pose, gt_jaw_pose, has_smplx_jaw_pose,
+ gt_keypoints2d[:, face_conf, 2])
+ if self.loss_smplx_hand_pose is not None:
+ pred_right_hand_pose = pred_param['right_hand_pose']
+ pred_right_hand_pose = pose2rotmat(pred_right_hand_pose)
+ gt_right_hand_pose = targets['smplx_right_hand_pose']
+ right_hand_conf = get_keypoint_idxs_by_part(
+ 'right_hand', self.convention)
+ has_smplx_right_hand_pose = targets[
+ 'has_smplx_right_hand_pose'].squeeze(-1)
+ losses['smplx_right_hand_pose_loss'] = \
+ self.compute_smplx_hand_pose_loss(
+ pred_right_hand_pose, gt_right_hand_pose,
+ has_smplx_right_hand_pose,
+ gt_keypoints2d[:, right_hand_conf, 2])
+ if 'left_hand_pose' in pred_param:
+ pred_left_hand_pose = pred_param['left_hand_pose']
+ pred_left_hand_pose = pose2rotmat(pred_left_hand_pose)
+ gt_left_hand_pose = targets['smplx_left_hand_pose']
+ left_hand_conf = get_keypoint_idxs_by_part(
+ 'left_hand', self.convention)
+ has_smplx_left_hand_pose = targets[
+ 'has_smplx_left_hand_pose'].squeeze(-1)
+ losses['smplx_left_hand_pose_loss'] = \
+ self.compute_smplx_hand_pose_loss(
+ pred_left_hand_pose, gt_left_hand_pose,
+ has_smplx_left_hand_pose,
+ gt_keypoints2d[:, left_hand_conf, 2])
+ if self.loss_smplx_betas is not None:
+ pred_betas = pred_param['betas']
+ gt_betas = targets['smplx_betas']
+ has_smplx_betas = targets['has_smplx_betas'].squeeze(-1)
+ losses['smplx_betas_loss'] = self.compute_smplx_betas_loss(
+ pred_betas, gt_betas, has_smplx_betas)
+ if self.loss_smplx_expression is not None:
+ pred_expression = pred_param['expression']
+ gt_expression = targets['smplx_expression']
+ face_conf = get_keypoint_idxs_by_part('head', self.convention)
+ has_smplx_expression = targets['has_smplx_expression'].squeeze(-1)
+ losses[
+ 'smplx_expression_loss'] = self.compute_smplx_expression_loss(
+ pred_expression, gt_expression, has_smplx_expression,
+ gt_keypoints2d[:, face_conf, 2])
+ if self.loss_smplx_betas_piror is not None:
+ pred_betas = pred_param['betas']
+ losses['smplx_betas_prior_loss'] = \
+ self.compute_smplx_betas_prior_loss(
+ pred_betas)
+ if self.loss_camera is not None:
+ losses['camera_loss'] = self.compute_camera_loss(pred_cam)
+ if self.apply_hand_model and self.hand_crop_loss is not None:
+ losses['hand_crop_loss'] = self.compute_hand_crop_loss(
+ pred_keypoints3d, pred_cam, gt_keypoints2d,
+ predictions['hand_crop_info'], targets['img'].shape[-1],
+ has_keypoints2d)
+ if self.apply_face_model and self.face_crop_loss is not None:
+ losses['face_crop_loss'] = self.compute_face_crop_loss(
+ pred_keypoints3d, pred_cam, gt_keypoints2d,
+ predictions['face_crop_info'], targets['img'].shape[-1],
+ has_keypoints2d)
+ return losses
+
+ @abstractmethod
+ def prepare_targets(self, data_batch):
+ pass
+
+ def forward_train(self, **kwargs):
+ """Forward function for general training.
+
+ For mesh estimation, we do not use this interface.
+ """
+ raise NotImplementedError('This interface should not be used in '
+ 'current training schedule. Please use '
+ '`train_step` for training.')
+
+ @abstractmethod
+ def forward_test(self, img, img_metas, **kwargs):
+ """Defines the computation performed at every call when testing."""
+ pass
+
+
+class SMPLXImageBodyModelEstimator(SMPLXBodyModelEstimator):
+ def prepare_targets(self, data_batch: dict):
+        # The image-based estimator does not need extra processing of the ground truth
+ return data_batch
+
+ def forward_test(self, img: torch.Tensor, img_metas: dict, **kwargs):
+ """Defines the computation performed at every call when testing."""
+ if self.backbone is not None:
+ features = self.backbone(img)
+ else:
+ features = kwargs['features']
+
+ if self.neck is not None:
+ features = self.neck(features)
+
+ predictions = self.head(features)
+ if self.apply_hand_model:
+ hand_input_img, hand_mean, hand_crop_info = self.crop_hand_func(
+ predictions, img_metas)
+ hand_features = self.hand_backbone(hand_input_img)
+ if self.neck is not None:
+ hand_features = self.hand_neck(hand_features)
+ hand_predictions = self.hand_head(hand_features, cond=hand_mean)
+ predictions = self.hand_merge_func(predictions, hand_predictions)
+ predictions['hand_crop_info'] = hand_crop_info
+ if self.apply_face_model:
+ face_input_img, face_mean, face_crop_info = self.crop_face_func(
+ predictions, img_metas)
+ face_features = self.face_backbone(face_input_img)
+ if self.neck is not None:
+ face_features = self.face_neck(face_features)
+ face_predictions = self.face_head(face_features, cond=face_mean)
+ predictions = self.face_merge_func(predictions, face_predictions)
+ predictions['face_crop_info'] = face_crop_info
+
+ pred_param = predictions['pred_param']
+ pred_cam = predictions['pred_cam']
+
+ pred_output = self.body_model_test(**pred_param)
+
+ pred_vertices = pred_output['vertices']
+ pred_keypoints_3d = pred_output['joints']
+ all_preds = {}
+ all_preds['keypoints_3d'] = pred_keypoints_3d.detach().cpu().numpy()
+        # Convert every tensor in pred_param to numpy before storing it.
+        for key, value in pred_param.items():
+            if isinstance(value, torch.Tensor):
+                pred_param[key] = value.detach().cpu().numpy()
+        all_preds['param'] = pred_param
+ all_preds['camera'] = pred_cam.detach().cpu().numpy()
+ all_preds['vertices'] = pred_vertices.detach().cpu().numpy()
+ image_path = []
+ for img_meta in img_metas:
+ image_path.append(img_meta['image_path'])
+ all_preds['image_path'] = image_path
+ all_preds['image_idx'] = kwargs['sample_idx']
+ return all_preds
+
+
+class FrozenBatchNorm2d(nn.Module):
+ """BatchNorm2d where the batch statistics and the affine parameters are
+ fixed."""
+ def __init__(self, n):
+ super(FrozenBatchNorm2d, self).__init__()
+ self.register_buffer('weight', torch.ones(n))
+ self.register_buffer('bias', torch.zeros(n))
+ self.register_buffer('running_mean', torch.zeros(n))
+ self.register_buffer('running_var', torch.ones(n))
+
+ @staticmethod
+ def from_bn(module: nn.BatchNorm2d):
+ """Initializes a frozen batch norm module from a batch norm module."""
+ dim = len(module.weight.data)
+
+ frozen_module = FrozenBatchNorm2d(dim)
+ frozen_module.weight.data = module.weight.data
+
+ missing, not_found = frozen_module.load_state_dict(module.state_dict(),
+ strict=False)
+ return frozen_module
+
+ @classmethod
+ def convert_frozen_batchnorm(cls, module):
+ """Convert BatchNorm/SyncBatchNorm in module into FrozenBatchNorm.
+
+ Args:
+ module (torch.nn.Module):
+
+ Returns:
+ If module is BatchNorm/SyncBatchNorm, returns a new module.
+ Otherwise, in-place convert module and return it.
+
+ Similar to convert_sync_batchnorm in
+ https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/batchnorm.py
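+
+        A minimal usage sketch (``backbone`` here stands for any module that
+        may contain BatchNorm2d layers)::
+
+            backbone = FrozenBatchNorm2d.convert_frozen_batchnorm(backbone)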
+ """
+ bn_module = nn.modules.batchnorm
+ bn_module = (bn_module.BatchNorm2d, bn_module.SyncBatchNorm)
+ res = module
+ if isinstance(module, bn_module):
+ res = cls(module.num_features)
+ if module.affine:
+ res.weight.data = module.weight.data.clone().detach()
+ res.bias.data = module.bias.data.clone().detach()
+ res.running_mean.data = module.running_mean.data
+ res.running_var.data = module.running_var.data
+ res.eps = module.eps
+ else:
+ for name, child in module.named_children():
+ new_child = cls.convert_frozen_batchnorm(child)
+ if new_child is not child:
+ res.add_module(name, new_child)
+ return res
+
+ def forward(self, x):
+ # Cast all fixed parameters to half() if necessary
+ if x.dtype == torch.float16:
+ self.weight = self.weight.half()
+ self.bias = self.bias.half()
+ self.running_mean = self.running_mean.half()
+ self.running_var = self.running_var.half()
+
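+        # training=False: always normalize with the stored running statistics,
+        # so the layer stays frozen regardless of the module's train/eval mode.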
+ return F.batch_norm(x, self.running_mean, self.running_var,
+ self.weight, self.bias, False)
diff --git a/detrsmpl/models/architectures/hybrik.py b/detrsmpl/models/architectures/hybrik.py
new file mode 100644
index 0000000000000000000000000000000000000000..5866b051054d537aeb3b41113e964c16630d14bc
--- /dev/null
+++ b/detrsmpl/models/architectures/hybrik.py
@@ -0,0 +1,276 @@
+# isort: skip_file
+from abc import ABCMeta
+
+import torch
+
+from detrsmpl.data.datasets.pipelines.hybrik_transforms import heatmap2coord
+from detrsmpl.utils.transforms import rotmat_to_quat
+from ..backbones.builder import build_backbone
+from ..body_models.builder import build_body_model
+from ..heads.builder import build_head
+from ..losses.builder import build_loss
+from ..necks.builder import build_neck
+from .base_architecture import BaseArchitecture
+
+
+def set_requires_grad(nets, requires_grad=False):
+ """Set requies_grad for all the networks.
+
+ Args:
+ nets (nn.Module | list[nn.Module]): A list of networks or a single
+ network.
+ requires_grad (bool): Whether the networks require gradients or not
+ """
+ if not isinstance(nets, list):
+ nets = [nets]
+ for net in nets:
+ if net is not None:
+ for param in net.parameters():
+ param.requires_grad = requires_grad
+
+
+class HybrIK_trainer(BaseArchitecture, metaclass=ABCMeta):
+ """Hybrik_trainer Architecture.
+
+ Args:
+ backbone (dict | None, optional): Backbone config dict. Default: None.
+ neck (dict | None, optional): Neck config dict. Default: None
+ head (dict | None, optional): Regressor config dict. Default: None.
+ body_model (dict | None, optional): SMPL config dict. Default: None.
+ loss_beta (dict | None, optional): Losses config dict for
+ beta (shape parameters) estimation. Default: None
+ loss_theta (dict | None, optional): Losses config dict for
+ theta (pose parameters) estimation. Default: None
+ loss_twist (dict | None, optional): Losses config dict
+ for twist angles estimation. Default: None
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Default: None
+ """
+ def __init__(self,
+ backbone=None,
+ neck=None,
+ head=None,
+ body_model=None,
+ loss_beta=None,
+ loss_theta=None,
+ loss_twist=None,
+ loss_uvd=None,
+ init_cfg=None):
+ super(HybrIK_trainer, self).__init__(init_cfg)
+
+ self.backbone = build_backbone(backbone)
+
+ self.neck = build_neck(neck)
+ self.head = build_head(head)
+ self.smpl = build_body_model(body_model)
+
+ self.loss_beta = build_loss(loss_beta)
+ self.loss_theta = build_loss(loss_theta)
+ self.loss_twist = build_loss(loss_twist)
+ self.loss_uvd = build_loss(loss_uvd)
+
+ self.head._initialize()
+
+ def forward_train(self, img, img_metas, **kwargs):
+ """Train step function.
+
+        In this function, the train step is carried out
+        following this pipeline:
+        1. extract features with the backbone
+        2. feed the extracted features into the head to
+        predict beta, theta, twist angles, and the heatmap (uvd map)
+        3. compute regression losses on the predictions
+        and optimize the backbone and head
+ Args:
+ img (torch.Tensor): Batch of data as input.
+ kwargs (dict): Dict with ground-truth
+ Returns:
+            losses (dict): Dict with the computed regression losses.
+ """
+ labels = {}
+ labels['trans_inv'] = kwargs['trans_inv']
+ labels['intrinsic_param'] = kwargs['intrinsic_param']
+ labels['joint_root'] = kwargs['joint_root']
+ labels['depth_factor'] = kwargs['depth_factor']
+ labels['target_uvd_29'] = kwargs['target_uvd_29']
+ labels['target_xyz_24'] = kwargs['target_xyz_24']
+ labels['target_weight_24'] = kwargs['target_weight_24']
+ labels['target_weight_29'] = kwargs['target_weight_29']
+ labels['target_xyz_17'] = kwargs['target_xyz_17']
+ labels['target_weight_17'] = kwargs['target_weight_17']
+ labels['target_theta'] = kwargs['target_theta']
+ labels['target_beta'] = kwargs['target_beta']
+ labels['target_smpl_weight'] = kwargs['target_smpl_weight']
+ labels['target_theta_weight'] = kwargs['target_theta_weight']
+ labels['target_twist'] = kwargs['target_twist']
+ labels['target_twist_weight'] = kwargs['target_twist_weight']
+ # flip_output = kwargs.pop('is_flipped', None)
+
+ for k, _ in labels.items():
+ labels[k] = labels[k].cuda()
+
+ trans_inv = labels.pop('trans_inv')
+ intrinsic_param = labels.pop('intrinsic_param')
+ joint_root = labels.pop('joint_root')
+ depth_factor = labels.pop('depth_factor')
+
+ if self.backbone is not None:
+ img = img.cuda().requires_grad_()
+ features = self.backbone(img)
+ features = features[0]
+ else:
+ features = img['features']
+
+ if self.neck is not None:
+ features = self.neck(features)
+
+ predictions = self.head(features, trans_inv, intrinsic_param,
+ joint_root, depth_factor, self.smpl)
+
+ losses = self.compute_losses(predictions, labels)
+
+ return losses
+
+ def compute_losses(self, predictions, targets):
+ """Compute regression losses for beta, theta, twist and uvd."""
+ smpl_weight = targets['target_smpl_weight']
+
+ losses = {}
+ if self.loss_beta is not None:
+ losses['loss_beta'] = self.loss_beta(
+ predictions['pred_shape'] * smpl_weight,
+ targets['target_beta'] * smpl_weight)
+ if self.loss_theta is not None:
+ pred_pose = rotmat_to_quat(predictions['pred_pose']).reshape(
+ -1, 96)
+ losses['loss_theta'] = self.loss_theta(
+ pred_pose * smpl_weight * targets['target_theta_weight'],
+ targets['target_theta'] * smpl_weight *
+ targets['target_theta_weight'])
+ if self.loss_twist is not None:
+ losses['loss_twist'] = self.loss_twist(
+ predictions['pred_phi'] * targets['target_twist_weight'],
+ targets['target_twist'] * targets['target_twist_weight'])
+ if self.loss_uvd is not None:
+ pred_uvd = predictions['pred_uvd_jts']
+ target_uvd = targets['target_uvd_29'][:, :pred_uvd.shape[1]]
+ target_uvd_weight = targets['target_weight_29'][:, :pred_uvd.
+ shape[1]]
+ losses['loss_uvd'] = self.loss_uvd(
+ 64 * predictions['pred_uvd_jts'],
+ 64 * target_uvd,
+ target_uvd_weight,
+ avg_factor=target_uvd_weight.sum())
+
+ return losses
+
+ def forward_test(self, img, img_metas, **kwargs):
+ """Test step function.
+
+        In this function, the test step is carried out
+        following this pipeline:
+        1. extract features with the backbone
+        2. feed the extracted features into the head to
+        predict beta, theta, twist angles, and the heatmap (uvd map)
+ 3. store predictions for evaluation
+ Args:
+ img (torch.Tensor): Batch of data as input.
+ img_metas (dict): Dict with image metas i.e. path
+ kwargs (dict): Dict with ground-truth
+ Returns:
+ all_preds (dict): Dict with image_path, vertices, xyz_17, uvd_jts,
+ xyz_24 for predictions.
+ """
+ labels = {}
+ labels['trans_inv'] = kwargs['trans_inv']
+ labels['intrinsic_param'] = kwargs['intrinsic_param']
+ labels['joint_root'] = kwargs['joint_root']
+ labels['depth_factor'] = kwargs['depth_factor']
+ labels['target_uvd_29'] = kwargs['target_uvd_29']
+ labels['target_xyz_24'] = kwargs['target_xyz_24']
+ labels['target_weight_24'] = kwargs['target_weight_24']
+ labels['target_weight_29'] = kwargs['target_weight_29']
+ labels['target_xyz_17'] = kwargs['target_xyz_17']
+ labels['target_weight_17'] = kwargs['target_weight_17']
+ labels['target_theta'] = kwargs['target_theta']
+ labels['target_beta'] = kwargs['target_beta']
+ labels['target_smpl_weight'] = kwargs['target_smpl_weight']
+ labels['target_theta_weight'] = kwargs['target_theta_weight']
+ labels['target_twist'] = kwargs['target_twist']
+ labels['target_twist_weight'] = kwargs['target_twist_weight']
+
+ bboxes = kwargs['bbox']
+
+ for k, _ in labels.items():
+ labels[k] = labels[k].cuda()
+
+ trans_inv = labels.pop('trans_inv')
+ intrinsic_param = labels.pop('intrinsic_param')
+ joint_root = labels.pop('joint_root')
+ depth_factor = labels.pop('depth_factor')
+ if len(depth_factor.shape) != 2:
+ depth_factor = torch.unsqueeze(depth_factor, dim=1)
+
+ if self.backbone is not None:
+ img = img.cuda().requires_grad_()
+ features = self.backbone(img)
+ features = features[0]
+ else:
+ features = img['features']
+
+ if self.neck is not None:
+ features = self.neck(features)
+
+ output = self.head(features, trans_inv, intrinsic_param, joint_root,
+ depth_factor, self.smpl)
+
+ pred_uvd_jts = output['pred_uvd_jts']
+ batch_num = pred_uvd_jts.shape[0]
+ pred_xyz_jts_24 = output['pred_xyz_jts_24'].reshape(batch_num, -1,
+ 3)[:, :24, :]
+ pred_xyz_jts_24_struct = output['pred_xyz_jts_24_struct'].reshape(
+ batch_num, 24, 3)
+ pred_xyz_jts_17 = output['pred_xyz_jts_17'].reshape(batch_num, 17, 3)
+ pred_mesh = output['pred_vertices'].reshape(batch_num, -1, 3)
+
+ pred_xyz_jts_24 = pred_xyz_jts_24.cpu().data.numpy()
+ pred_xyz_jts_24_struct = pred_xyz_jts_24_struct.cpu().data.numpy()
+ pred_xyz_jts_17 = pred_xyz_jts_17.cpu().data.numpy()
+ pred_uvd_jts = pred_uvd_jts.cpu().data
+ pred_mesh = pred_mesh.cpu().data.numpy()
+ pred_pose = output['pred_pose'].cpu().data.numpy()
+ pred_beta = output['pred_shape'].cpu().data.numpy()
+
+ assert pred_xyz_jts_17.ndim in [2, 3]
+ pred_xyz_jts_17 = pred_xyz_jts_17.reshape(pred_xyz_jts_17.shape[0], 17,
+ 3)
+ pred_uvd_jts = pred_uvd_jts.reshape(pred_uvd_jts.shape[0], -1, 3)
+ pred_xyz_jts_24 = pred_xyz_jts_24.reshape(pred_xyz_jts_24.shape[0], 24,
+ 3)
+ pred_scores = output['maxvals'].cpu().data[:, :29]
+
+ hm_shape = [64, 64]
+ pose_coords_list = []
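+        # Convert each sample's predicted uvd joints to image-space 2D
+        # coordinates using its bounding box.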
+ for i in range(pred_xyz_jts_17.shape[0]):
+ bbox = bboxes[i].tolist()
+ pose_coords, _ = heatmap2coord(pred_uvd_jts[i],
+ pred_scores[i],
+ hm_shape,
+ bbox,
+ mean_bbox_scale=None)
+ pose_coords_list.append(pose_coords)
+
+ all_preds = {}
+ all_preds['vertices'] = pred_mesh
+ all_preds['smpl_pose'] = pred_pose
+ all_preds['smpl_beta'] = pred_beta
+ all_preds['xyz_17'] = pred_xyz_jts_17
+        all_preds['uvd_jts'] = pose_coords_list
+ all_preds['xyz_24'] = pred_xyz_jts_24_struct
+ image_path = []
+ for img_meta in img_metas:
+ image_path.append(img_meta['image_path'])
+ all_preds['image_path'] = image_path
+ all_preds['image_idx'] = kwargs['sample_idx']
+ return all_preds
diff --git a/detrsmpl/models/architectures/mesh_estimator.py b/detrsmpl/models/architectures/mesh_estimator.py
new file mode 100644
index 0000000000000000000000000000000000000000..7f2b1b294fbda665407af7a4776c59f64f81e11c
--- /dev/null
+++ b/detrsmpl/models/architectures/mesh_estimator.py
@@ -0,0 +1,865 @@
+from abc import ABCMeta, abstractmethod
+from typing import Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+
+import detrsmpl.core.visualization.visualize_smpl as visualize_smpl
+from detrsmpl.core.conventions.keypoints_mapping import get_keypoint_idx
+from detrsmpl.models.utils import FitsDict
+from detrsmpl.utils.geometry import (
+ batch_rodrigues,
+ estimate_translation,
+ project_points,
+ rotation_matrix_to_angle_axis,
+)
+from ..backbones.builder import build_backbone
+from ..body_models.builder import build_body_model
+from ..discriminators.builder import build_discriminator
+from ..heads.builder import build_head
+from ..losses.builder import build_loss
+from ..necks.builder import build_neck
+from ..registrants.builder import build_registrant
+from .base_architecture import BaseArchitecture
+
+
+def set_requires_grad(nets, requires_grad=False):
+ """Set requies_grad for all the networks.
+
+ Args:
+ nets (nn.Module | list[nn.Module]): A list of networks or a single
+ network.
+ requires_grad (bool): Whether the networks require gradients or not
+ """
+ if not isinstance(nets, list):
+ nets = [nets]
+ for net in nets:
+ if net is not None:
+ for param in net.parameters():
+ param.requires_grad = requires_grad
+
+
+class BodyModelEstimator(BaseArchitecture, metaclass=ABCMeta):
+ """BodyModelEstimator Architecture.
+
+ Args:
+ backbone (dict | None, optional): Backbone config dict. Default: None.
+ neck (dict | None, optional): Neck config dict. Default: None
+ head (dict | None, optional): Regressor config dict. Default: None.
+ disc (dict | None, optional): Discriminator config dict.
+ Default: None.
+ registration (dict | None, optional): Registration config dict.
+ Default: None.
+ body_model_train (dict | None, optional): SMPL config dict during
+ training. Default: None.
+ body_model_test (dict | None, optional): SMPL config dict during
+ test. Default: None.
+ convention (str, optional): Keypoints convention. Default: "human_data"
+ loss_keypoints2d (dict | None, optional): Losses config dict for
+ 2D keypoints. Default: None.
+ loss_keypoints3d (dict | None, optional): Losses config dict for
+ 3D keypoints. Default: None.
+ loss_vertex (dict | None, optional): Losses config dict for mesh
+ vertices. Default: None
+ loss_smpl_pose (dict | None, optional): Losses config dict for smpl
+ pose. Default: None
+ loss_smpl_betas (dict | None, optional): Losses config dict for smpl
+ betas. Default: None
+ loss_camera (dict | None, optional): Losses config dict for predicted
+ camera parameters. Default: None
+        loss_adv (dict | None, optional): Losses config for adversarial
+ training. Default: None.
+ loss_segm_mask (dict | None, optional): Losses config for predicted
+ part segmentation. Default: None.
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Default: None.
+ """
+ def __init__(self,
+ backbone: Optional[Union[dict, None]] = None,
+ neck: Optional[Union[dict, None]] = None,
+ head: Optional[Union[dict, None]] = None,
+ disc: Optional[Union[dict, None]] = None,
+ registration: Optional[Union[dict, None]] = None,
+ body_model_train: Optional[Union[dict, None]] = None,
+ body_model_test: Optional[Union[dict, None]] = None,
+ convention: Optional[str] = 'human_data',
+ loss_keypoints2d: Optional[Union[dict, None]] = None,
+ loss_keypoints3d: Optional[Union[dict, None]] = None,
+ loss_vertex: Optional[Union[dict, None]] = None,
+ loss_smpl_pose: Optional[Union[dict, None]] = None,
+ loss_smpl_betas: Optional[Union[dict, None]] = None,
+ loss_camera: Optional[Union[dict, None]] = None,
+ loss_adv: Optional[Union[dict, None]] = None,
+ loss_segm_mask: Optional[Union[dict, None]] = None,
+ init_cfg: Optional[Union[list, dict, None]] = None):
+ super(BodyModelEstimator, self).__init__(init_cfg)
+ self.backbone = build_backbone(backbone)
+ self.neck = build_neck(neck)
+ self.head = build_head(head)
+ self.disc = build_discriminator(disc)
+
+ self.body_model_train = build_body_model(body_model_train)
+ self.body_model_test = build_body_model(body_model_test)
+ self.convention = convention
+
+ # TODO: support HMR+
+
+ self.registration = registration
+ if registration is not None:
+ self.fits_dict = FitsDict(fits='static')
+ self.registration_mode = self.registration['mode']
+ self.registrant = build_registrant(registration['registrant'])
+ else:
+ self.registrant = None
+
+ self.loss_keypoints2d = build_loss(loss_keypoints2d)
+ self.loss_keypoints3d = build_loss(loss_keypoints3d)
+
+ self.loss_vertex = build_loss(loss_vertex)
+ self.loss_smpl_pose = build_loss(loss_smpl_pose)
+ self.loss_smpl_betas = build_loss(loss_smpl_betas)
+ self.loss_adv = build_loss(loss_adv)
+ self.loss_camera = build_loss(loss_camera)
+ self.loss_segm_mask = build_loss(loss_segm_mask)
+ set_requires_grad(self.body_model_train, False)
+ set_requires_grad(self.body_model_test, False)
+
+ def train_step(self, data_batch, optimizer, **kwargs):
+ """Train step function.
+
+        In this function, the estimator will finish the train step following
+        this pipeline:
+        1. get fake and real SMPL parameters
+        2. optimize the discriminator (if present)
+        3. optimize the generator
+        If `self.train_cfg.disc_step > 1`, the train step will contain multiple
+        iterations for optimizing the discriminator with different input data
+        and only one iteration for optimizing the generator after `disc_step`
+        iterations for the discriminator.
+ Args:
+ data_batch (torch.Tensor): Batch of data as input.
+ optimizer (dict[torch.optim.Optimizer]): Dict with optimizers for
+            the generator and the discriminator (if present).
+ Returns:
+ outputs (dict): Dict with loss, information for logger,
+ the number of samples.
+ """
+ if self.backbone is not None:
+ img = data_batch['img']
+ features = self.backbone(img)
+ else:
+ features = data_batch['features']
+
+ if self.neck is not None:
+ features = self.neck(features)
+
+ predictions = self.head(features)
+ targets = self.prepare_targets(data_batch)
+
+        # optimize the discriminator (if present)
+ if self.disc is not None:
+ self.optimize_discrinimator(predictions, data_batch, optimizer)
+
+ if self.registration is not None:
+ targets = self.run_registration(predictions, targets)
+
+ losses = self.compute_losses(predictions, targets)
+        # optimize the generator
+ if self.disc is not None:
+ adv_loss = self.optimize_generator(predictions)
+ losses.update(adv_loss)
+
+ loss, log_vars = self._parse_losses(losses)
+ for key in optimizer.keys():
+ optimizer[key].zero_grad()
+ loss.backward()
+ for key in optimizer.keys():
+ optimizer[key].step()
+
+ outputs = dict(loss=loss,
+ log_vars=log_vars,
+ num_samples=len(next(iter(data_batch.values()))))
+ return outputs
+
+ def run_registration(
+ self,
+ predictions: dict,
+ targets: dict,
+ threshold: Optional[float] = 10.0,
+ focal_length: Optional[float] = 5000.0,
+ img_res: Optional[Union[Tuple[int], int]] = 224) -> dict:
+ """Run registration on 2D keypoinst in predictions to obtain SMPL
+ parameters as pseudo ground truth.
+
+ Args:
+ predictions (dict): predicted SMPL parameters are used for
+ initialization.
+ targets (dict): existing ground truths with 2D keypoints
+ threshold (float, optional): the threshold to update fits
+ dictionary. Default: 10.0.
+ focal_length (tuple(int) | int, optional): camera focal_length
+ img_res (int, optional): image resolution
+
+ Returns:
+ targets: contains additional SMPL parameters
+ """
+
+ img_metas = targets['img_metas']
+ dataset_name = [meta['dataset_name'] for meta in img_metas
+ ] # name of the dataset the image comes from
+
+ indices = targets['sample_idx'].squeeze()
+ is_flipped = targets['is_flipped'].squeeze().bool(
+ ) # flag that indicates whether image was flipped
+ # during data augmentation
+ rot_angle = targets['rotation'].squeeze(
+        ) # rotation angle used for data augmentation
+ gt_betas = targets['smpl_betas'].float()
+ gt_global_orient = targets['smpl_global_orient'].float()
+ gt_pose = targets['smpl_body_pose'].float().view(-1, 69)
+
+ pred_rotmat = predictions['pred_pose'].detach().clone()
+ pred_betas = predictions['pred_shape'].detach().clone()
+ pred_cam = predictions['pred_cam'].detach().clone()
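+        # Convert the weak-perspective camera (s, tx, ty) to a 3D translation;
+        # depth is approximated as 2 * focal_length / (s * img_res).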
+ pred_cam_t = torch.stack([
+ pred_cam[:, 1], pred_cam[:, 2], 2 * focal_length /
+ (img_res * pred_cam[:, 0] + 1e-9)
+ ],
+ dim=-1)
+
+ gt_keypoints_2d = targets['keypoints2d'].float()
+ num_keypoints = gt_keypoints_2d.shape[1]
+
+ has_smpl = targets['has_smpl'].view(
+ -1).bool() # flag that indicates whether SMPL parameters are valid
+ batch_size = has_smpl.shape[0]
+ device = has_smpl.device
+
+ # Get GT vertices and model joints
+ # Note that gt_model_joints is different from gt_joints as
+ # it comes from SMPL
+ gt_out = self.body_model_train(betas=gt_betas,
+ body_pose=gt_pose,
+ global_orient=gt_global_orient)
+        # TODO: support more conventions
+ assert num_keypoints == 49
+ gt_model_joints = gt_out['joints']
+ gt_vertices = gt_out['vertices']
+
+ # Get current best fits from the dictionary
+ opt_pose, opt_betas = self.fits_dict[(dataset_name, indices.cpu(),
+ rot_angle.cpu(),
+ is_flipped.cpu())]
+
+ opt_pose = opt_pose.to(device)
+ opt_betas = opt_betas.to(device)
+ opt_output = self.body_model_train(betas=opt_betas,
+ body_pose=opt_pose[:, 3:],
+ global_orient=opt_pose[:, :3])
+ opt_joints = opt_output['joints']
+ opt_vertices = opt_output['vertices']
+
+ gt_keypoints_2d_orig = gt_keypoints_2d.clone()
+ # Estimate camera translation given the model joints and 2D keypoints
+ # by minimizing a weighted least squares loss
+ gt_cam_t = estimate_translation(gt_model_joints,
+ gt_keypoints_2d_orig,
+ focal_length=focal_length,
+ img_size=img_res)
+
+ opt_cam_t = estimate_translation(opt_joints,
+ gt_keypoints_2d_orig,
+ focal_length=focal_length,
+ img_size=img_res)
+
+ with torch.no_grad():
+ loss_dict = self.registrant.evaluate(
+ global_orient=opt_pose[:, :3],
+ body_pose=opt_pose[:, 3:],
+ betas=opt_betas,
+ transl=opt_cam_t,
+ keypoints2d=gt_keypoints_2d_orig[:, :, :2],
+ keypoints2d_conf=gt_keypoints_2d_orig[:, :, 2],
+ reduction_override='none')
+ opt_joint_loss = loss_dict['keypoint2d_loss'].sum(dim=-1).sum(dim=-1)
+
+ if self.registration_mode == 'in_the_loop':
+ # Convert predicted rotation matrices to axis-angle
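+            # rotation_matrix_to_angle_axis expects (N, 3, 4) inputs, so each
+            # 3x3 rotation is first padded with a constant fourth column.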
+ pred_rotmat_hom = torch.cat([
+ pred_rotmat.detach().view(-1, 3, 3).detach(),
+ torch.tensor([0, 0, 1], dtype=torch.float32,
+ device=device).view(1, 3, 1).expand(
+ batch_size * 24, -1, -1)
+ ],
+ dim=-1)
+ pred_pose = rotation_matrix_to_angle_axis(
+ pred_rotmat_hom).contiguous().view(batch_size, -1)
+ # tgm.rotation_matrix_to_angle_axis returns NaN for 0 rotation,
+ # so manually hack it
+ pred_pose[torch.isnan(pred_pose)] = 0.0
+
+ registrant_output = self.registrant(
+ keypoints2d=gt_keypoints_2d_orig[:, :, :2],
+ keypoints2d_conf=gt_keypoints_2d_orig[:, :, 2],
+ init_global_orient=pred_pose[:, :3],
+ init_transl=pred_cam_t,
+ init_body_pose=pred_pose[:, 3:],
+ init_betas=pred_betas,
+ return_joints=True,
+ return_verts=True,
+ return_losses=True)
+ new_opt_vertices = registrant_output[
+ 'vertices'] - pred_cam_t.unsqueeze(1)
+ new_opt_joints = registrant_output[
+ 'joints'] - pred_cam_t.unsqueeze(1)
+
+ new_opt_global_orient = registrant_output['global_orient']
+ new_opt_body_pose = registrant_output['body_pose']
+ new_opt_pose = torch.cat(
+ [new_opt_global_orient, new_opt_body_pose], dim=1)
+
+ new_opt_betas = registrant_output['betas']
+ new_opt_cam_t = registrant_output['transl']
+ new_opt_joint_loss = registrant_output['keypoint2d_loss'].sum(
+ dim=-1).sum(dim=-1)
+
+ # Will update the dictionary for the examples where the new loss
+ # is less than the current one
+ update = (new_opt_joint_loss < opt_joint_loss)
+
+ opt_joint_loss[update] = new_opt_joint_loss[update]
+ opt_vertices[update, :] = new_opt_vertices[update, :]
+ opt_joints[update, :] = new_opt_joints[update, :]
+ opt_pose[update, :] = new_opt_pose[update, :]
+ opt_betas[update, :] = new_opt_betas[update, :]
+ opt_cam_t[update, :] = new_opt_cam_t[update, :]
+
+ self.fits_dict[(dataset_name, indices.cpu(), rot_angle.cpu(),
+ is_flipped.cpu(),
+ update.cpu())] = (opt_pose.cpu(), opt_betas.cpu())
+
+ # Replace extreme betas with zero betas
+ opt_betas[(opt_betas.abs() > 3).any(dim=-1)] = 0.
+
+ # Replace the optimized parameters with the ground truth parameters,
+ # if available
+ opt_vertices[has_smpl, :, :] = gt_vertices[has_smpl, :, :]
+ opt_cam_t[has_smpl, :] = gt_cam_t[has_smpl, :]
+ opt_joints[has_smpl, :, :] = gt_model_joints[has_smpl, :, :]
+ opt_pose[has_smpl, 3:] = gt_pose[has_smpl, :]
+ opt_pose[has_smpl, :3] = gt_global_orient[has_smpl, :]
+ opt_betas[has_smpl, :] = gt_betas[has_smpl, :]
+
+        # Determine whether a fit is valid by comparing the joint loss
+        # with the threshold
+ valid_fit = (opt_joint_loss < threshold).to(device)
+ valid_fit = valid_fit | has_smpl
+ targets['valid_fit'] = valid_fit
+
+ targets['opt_vertices'] = opt_vertices
+ targets['opt_cam_t'] = opt_cam_t
+ targets['opt_joints'] = opt_joints
+ targets['opt_pose'] = opt_pose
+ targets['opt_betas'] = opt_betas
+
+ return targets
+
+ def optimize_discrinimator(self, predictions: dict, data_batch: dict,
+ optimizer: dict):
+ """Optimize discrinimator during adversarial training."""
+ set_requires_grad(self.disc, True)
+ fake_data = self.make_fake_data(predictions, requires_grad=False)
+ real_data = self.make_real_data(data_batch)
+ fake_score = self.disc(fake_data)
+ real_score = self.disc(real_data)
+
+ disc_losses = {}
+ disc_losses['real_loss'] = self.loss_adv(real_score,
+ target_is_real=True,
+ is_disc=True)
+ disc_losses['fake_loss'] = self.loss_adv(fake_score,
+ target_is_real=False,
+ is_disc=True)
+ loss_disc, log_vars_d = self._parse_losses(disc_losses)
+
+ optimizer['disc'].zero_grad()
+ loss_disc.backward()
+ optimizer['disc'].step()
+
+ def optimize_generator(self, predictions: dict):
+ """Optimize generator during adversarial training."""
+ set_requires_grad(self.disc, False)
+ fake_data = self.make_fake_data(predictions, requires_grad=True)
+ pred_score = self.disc(fake_data)
+ loss_adv = self.loss_adv(pred_score,
+ target_is_real=True,
+ is_disc=False)
+ loss = dict(adv_loss=loss_adv)
+ return loss
+
+ def compute_keypoints3d_loss(
+ self,
+ pred_keypoints3d: torch.Tensor,
+ gt_keypoints3d: torch.Tensor,
+ has_keypoints3d: Optional[torch.Tensor] = None):
+ """Compute loss for 3d keypoints."""
+ keypoints3d_conf = gt_keypoints3d[:, :, 3].float().unsqueeze(-1)
+ keypoints3d_conf = keypoints3d_conf.repeat(1, 1, 3)
+ pred_keypoints3d = pred_keypoints3d.float()
+ gt_keypoints3d = gt_keypoints3d[:, :, :3].float()
+
+ # currently, only mpi_inf_3dhp and h36m have 3d keypoints
+ # both datasets have right_hip_extra and left_hip_extra
+ right_hip_idx = get_keypoint_idx('right_hip_extra', self.convention)
+ left_hip_idx = get_keypoint_idx('left_hip_extra', self.convention)
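+        # Center both predictions and ground truth on the pelvis (mid-hip)
+        # so that the 3D keypoint loss is invariant to global translation.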
+ gt_pelvis = (gt_keypoints3d[:, right_hip_idx, :] +
+ gt_keypoints3d[:, left_hip_idx, :]) / 2
+ pred_pelvis = (pred_keypoints3d[:, right_hip_idx, :] +
+ pred_keypoints3d[:, left_hip_idx, :]) / 2
+
+ gt_keypoints3d = gt_keypoints3d - gt_pelvis[:, None, :]
+ pred_keypoints3d = pred_keypoints3d - pred_pelvis[:, None, :]
+ loss = self.loss_keypoints3d(pred_keypoints3d,
+ gt_keypoints3d,
+ reduction_override='none')
+
+        # If has_keypoints3d is not None, compute the loss only on instances
+        # that have ground-truth keypoints3d; zero-confidence keypoints are
+        # still included in the mean.
+        # Otherwise, average only over keypoints with positive confidence.
+
+        # has_keypoints3d is None when the key has_keypoints3d
+        # is not in the datasets
+ if has_keypoints3d is None:
+
+ valid_pos = keypoints3d_conf > 0
+ if keypoints3d_conf[valid_pos].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints3d)
+ loss = torch.sum(loss * keypoints3d_conf)
+ loss /= keypoints3d_conf[valid_pos].numel()
+ else:
+
+ keypoints3d_conf = keypoints3d_conf[has_keypoints3d == 1]
+ if keypoints3d_conf.shape[0] == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints3d)
+ loss = loss[has_keypoints3d == 1]
+ loss = (loss * keypoints3d_conf).mean()
+ return loss
+
+ def compute_keypoints2d_loss(
+ self,
+ pred_keypoints3d: torch.Tensor,
+ pred_cam: torch.Tensor,
+ gt_keypoints2d: torch.Tensor,
+ img_res: Optional[int] = 224,
+ focal_length: Optional[int] = 5000,
+ has_keypoints2d: Optional[torch.Tensor] = None):
+ """Compute loss for 2d keypoints."""
+ keypoints2d_conf = gt_keypoints2d[:, :, 2].float().unsqueeze(-1)
+ keypoints2d_conf = keypoints2d_conf.repeat(1, 1, 2)
+ gt_keypoints2d = gt_keypoints2d[:, :, :2].float()
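+        # Project the predicted 3D keypoints onto the image plane with the
+        # predicted camera before comparing them with the 2D ground truth.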
+ pred_keypoints2d = project_points(pred_keypoints3d,
+ pred_cam,
+ focal_length=focal_length,
+ img_res=img_res)
+ # Normalize keypoints to [-1,1]
+ # The coordinate origin of pred_keypoints_2d is
+ # the center of the input image.
+ pred_keypoints2d = 2 * pred_keypoints2d / (img_res - 1)
+ # The coordinate origin of gt_keypoints_2d is
+ # the top left corner of the input image.
+ gt_keypoints2d = 2 * gt_keypoints2d / (img_res - 1) - 1
+ loss = self.loss_keypoints2d(pred_keypoints2d,
+ gt_keypoints2d,
+ reduction_override='none')
+
+        # If has_keypoints2d is not None, compute the loss only on instances
+        # that have ground-truth keypoints2d; zero-confidence keypoints are
+        # still included in the mean.
+        # Otherwise, average only over keypoints with positive confidence.
+        # has_keypoints2d is None when the key has_keypoints2d
+        # is not in the datasets
+
+ if has_keypoints2d is None:
+ valid_pos = keypoints2d_conf > 0
+ if keypoints2d_conf[valid_pos].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints2d)
+ loss = torch.sum(loss * keypoints2d_conf)
+ loss /= keypoints2d_conf[valid_pos].numel()
+ else:
+ keypoints2d_conf = keypoints2d_conf[has_keypoints2d == 1]
+ if keypoints2d_conf.shape[0] == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints2d)
+ loss = loss[has_keypoints2d == 1]
+ loss = (loss * keypoints2d_conf).mean()
+
+ return loss
+
+ def compute_vertex_loss(self, pred_vertices: torch.Tensor,
+ gt_vertices: torch.Tensor, has_smpl: torch.Tensor):
+ """Compute loss for vertices."""
+ gt_vertices = gt_vertices.float()
+ conf = has_smpl.float().view(-1, 1, 1)
+ conf = conf.repeat(1, gt_vertices.shape[1], gt_vertices.shape[2])
+ loss = self.loss_vertex(pred_vertices,
+ gt_vertices,
+ reduction_override='none')
+ valid_pos = conf > 0
+ if conf[valid_pos].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_vertices)
+ loss = torch.sum(loss * conf) / conf[valid_pos].numel()
+ return loss
+
+ def compute_smpl_pose_loss(self, pred_rotmat: torch.Tensor,
+ gt_pose: torch.Tensor, has_smpl: torch.Tensor):
+ """Compute loss for smpl pose."""
+ conf = has_smpl.float().view(-1)
+ valid_pos = conf > 0
+ if conf[valid_pos].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_pose)
+ pred_rotmat = pred_rotmat[valid_pos]
+ gt_pose = gt_pose[valid_pos]
+ conf = conf[valid_pos]
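+        # Convert the axis-angle ground-truth pose (N, 72) to rotation
+        # matrices (N, 24, 3, 3) to match the predicted rotation matrices.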
+ gt_rotmat = batch_rodrigues(gt_pose.view(-1, 3)).view(-1, 24, 3, 3)
+ loss = self.loss_smpl_pose(pred_rotmat,
+ gt_rotmat,
+ reduction_override='none')
+ loss = loss.view(loss.shape[0], -1).mean(-1)
+ loss = torch.mean(loss * conf)
+ return loss
+
+ def compute_smpl_betas_loss(self, pred_betas: torch.Tensor,
+ gt_betas: torch.Tensor,
+ has_smpl: torch.Tensor):
+ """Compute loss for smpl betas."""
+ conf = has_smpl.float().view(-1)
+ valid_pos = conf > 0
+ if conf[valid_pos].numel() == 0:
+ return torch.Tensor([0]).type_as(gt_betas)
+ pred_betas = pred_betas[valid_pos]
+ gt_betas = gt_betas[valid_pos]
+ conf = conf[valid_pos]
+ loss = self.loss_smpl_betas(pred_betas,
+ gt_betas,
+ reduction_override='none')
+ loss = loss.view(loss.shape[0], -1).mean(-1)
+ loss = torch.mean(loss * conf)
+ return loss
+
+ def compute_camera_loss(self, cameras: torch.Tensor):
+ """Compute loss for predicted camera parameters."""
+ loss = self.loss_camera(cameras)
+ return loss
+
+ def compute_part_segmentation_loss(self,
+ pred_heatmap: torch.Tensor,
+ gt_vertices: torch.Tensor,
+ gt_keypoints2d: torch.Tensor,
+ gt_model_joints: torch.Tensor,
+ has_smpl: torch.Tensor,
+ img_res: Optional[int] = 224,
+ focal_length: Optional[int] = 500):
+ """Compute loss for part segmentations."""
+ device = gt_keypoints2d.device
+ gt_keypoints2d_valid = gt_keypoints2d[has_smpl == 1]
+ batch_size = gt_keypoints2d_valid.shape[0]
+
+ gt_vertices_valid = gt_vertices[has_smpl == 1]
+ gt_model_joints_valid = gt_model_joints[has_smpl == 1]
+
+ if batch_size == 0:
+ return torch.Tensor([0]).type_as(gt_keypoints2d)
+ gt_cam_t = estimate_translation(
+ gt_model_joints_valid,
+ gt_keypoints2d_valid,
+ focal_length=focal_length,
+ img_size=img_res,
+ )
+
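+        # Build pinhole intrinsics with the principal point at the image
+        # center; the same focal length is used for the translation
+        # estimate above and the part-silhouette rendering below.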
+ K = torch.eye(3)
+ K[0, 0] = focal_length
+ K[1, 1] = focal_length
+ K[2, 2] = 1
+ K[0, 2] = img_res / 2.
+ K[1, 2] = img_res / 2.
+ K = K[None, :, :]
+
+ R = torch.eye(3)[None, :, :]
+ device = gt_keypoints2d.device
+ gt_sem_mask = visualize_smpl.render_smpl(
+ verts=gt_vertices_valid,
+ R=R,
+ K=K,
+ T=gt_cam_t,
+ render_choice='part_silhouette',
+ resolution=img_res,
+ return_tensor=True,
+ body_model=self.body_model_train,
+ device=device,
+ in_ndc=False,
+ convention='pytorch3d',
+ projection='perspective',
+ no_grad=True,
+ batch_size=batch_size,
+ verbose=False,
+ )
+ gt_sem_mask = torch.flip(gt_sem_mask, [1, 2]).squeeze(-1).detach()
+ pred_heatmap_valid = pred_heatmap[has_smpl == 1]
+ ph, pw = pred_heatmap_valid.size(2), pred_heatmap_valid.size(3)
+ h, w = gt_sem_mask.size(1), gt_sem_mask.size(2)
+ if ph != h or pw != w:
+ pred_heatmap_valid = F.interpolate(input=pred_heatmap_valid,
+ size=(h, w),
+ mode='bilinear')
+
+ loss = self.loss_segm_mask(pred_heatmap_valid, gt_sem_mask)
+ return loss
+
+ def compute_losses(self, predictions: dict, targets: dict):
+ """Compute losses."""
+ pred_betas = predictions['pred_shape'].view(-1, 10)
+ pred_pose = predictions['pred_pose'].view(-1, 24, 3, 3)
+ pred_cam = predictions['pred_cam'].view(-1, 3)
+
+ gt_keypoints3d = targets['keypoints3d']
+ gt_keypoints2d = targets['keypoints2d']
+ # pred_pose N, 24, 3, 3
+ if self.body_model_train is not None:
+ pred_output = self.body_model_train(
+ betas=pred_betas,
+ body_pose=pred_pose[:, 1:],
+ global_orient=pred_pose[:, 0].unsqueeze(1),
+ pose2rot=False,
+ num_joints=gt_keypoints2d.shape[1])
+ pred_keypoints3d = pred_output['joints']
+ pred_vertices = pred_output['vertices']
+
+ # # TODO: temp. Should we multiply confs here?
+ # pred_keypoints3d_mask = pred_output['joint_mask']
+ # keypoints3d_mask = keypoints3d_mask * pred_keypoints3d_mask
+
+ # TODO: temp solution
+ if 'valid_fit' in targets:
+ has_smpl = targets['valid_fit'].view(-1)
+ # global_orient = targets['opt_pose'][:, :3].view(-1, 1, 3)
+ gt_pose = targets['opt_pose']
+ gt_betas = targets['opt_betas']
+ gt_vertices = targets['opt_vertices']
+ else:
+ has_smpl = targets['has_smpl'].view(-1)
+ gt_pose = targets['smpl_body_pose']
+ global_orient = targets['smpl_global_orient'].view(-1, 1, 3)
+ gt_pose = torch.cat((global_orient, gt_pose), dim=1).float()
+ gt_betas = targets['smpl_betas'].float()
+
+ # gt_pose N, 72
+ if self.body_model_train is not None:
+ gt_output = self.body_model_train(
+ betas=gt_betas,
+ body_pose=gt_pose[:, 3:],
+ global_orient=gt_pose[:, :3],
+ num_joints=gt_keypoints2d.shape[1])
+ gt_vertices = gt_output['vertices']
+ gt_model_joints = gt_output['joints']
+ if 'has_keypoints3d' in targets:
+ has_keypoints3d = targets['has_keypoints3d'].squeeze(-1)
+ else:
+ has_keypoints3d = None
+ if 'has_keypoints2d' in targets:
+ has_keypoints2d = targets['has_keypoints2d'].squeeze(-1)
+ else:
+ has_keypoints2d = None
+ if 'pred_segm_mask' in predictions:
+ pred_segm_mask = predictions['pred_segm_mask']
+ losses = {}
+ if self.loss_keypoints3d is not None:
+ losses['keypoints3d_loss'] = self.compute_keypoints3d_loss(
+ pred_keypoints3d,
+ gt_keypoints3d,
+ has_keypoints3d=has_keypoints3d)
+ if self.loss_keypoints2d is not None:
+ losses['keypoints2d_loss'] = self.compute_keypoints2d_loss(
+ pred_keypoints3d,
+ pred_cam,
+ gt_keypoints2d,
+ has_keypoints2d=has_keypoints2d)
+ if self.loss_vertex is not None:
+ losses['vertex_loss'] = self.compute_vertex_loss(
+ pred_vertices, gt_vertices, has_smpl)
+ if self.loss_smpl_pose is not None:
+ losses['smpl_pose_loss'] = self.compute_smpl_pose_loss(
+ pred_pose, gt_pose, has_smpl)
+ if self.loss_smpl_betas is not None:
+ losses['smpl_betas_loss'] = self.compute_smpl_betas_loss(
+ pred_betas, gt_betas, has_smpl)
+ if self.loss_camera is not None:
+ losses['camera_loss'] = self.compute_camera_loss(pred_cam)
+ if self.loss_segm_mask is not None:
+ losses['loss_segm_mask'] = self.compute_part_segmentation_loss(
+ pred_segm_mask, gt_vertices, gt_keypoints2d, gt_model_joints,
+ has_smpl)
+
+ return losses
+
+ @abstractmethod
+ def make_fake_data(self, predictions, requires_grad):
+ pass
+
+ @abstractmethod
+ def make_real_data(self, data_batch):
+ pass
+
+ @abstractmethod
+ def prepare_targets(self, data_batch):
+ pass
+
+ def forward_train(self, **kwargs):
+ """Forward function for general training.
+
+ For mesh estimation, we do not use this interface.
+ """
+ raise NotImplementedError('This interface should not be used in '
+ 'current training schedule. Please use '
+ '`train_step` for training.')
+
+ @abstractmethod
+ def forward_test(self, img, img_metas, **kwargs):
+ """Defines the computation performed at every call when testing."""
+ pass
+
+
+class ImageBodyModelEstimator(BodyModelEstimator):
+ def make_fake_data(self, predictions: dict, requires_grad: bool):
+ pred_cam = predictions['pred_cam']
+ pred_pose = predictions['pred_pose']
+ pred_betas = predictions['pred_shape']
+ if requires_grad:
+ fake_data = (pred_cam, pred_pose, pred_betas)
+ else:
+ fake_data = (pred_cam.detach(), pred_pose.detach(),
+ pred_betas.detach())
+ return fake_data
+
+ def make_real_data(self, data_batch: dict):
+ transl = data_batch['adv_smpl_transl'].float()
+ global_orient = data_batch['adv_smpl_global_orient']
+ body_pose = data_batch['adv_smpl_body_pose']
+ betas = data_batch['adv_smpl_betas'].float()
+ pose = torch.cat((global_orient, body_pose), dim=-1).float()
+ real_data = (transl, pose, betas)
+ return real_data
+
+ def prepare_targets(self, data_batch: dict):
+        # Image Mesh Estimator does not need extra processing of the
+        # ground truth
+ return data_batch
+
+ def forward_test(self, img: torch.Tensor, img_metas: dict, **kwargs):
+ """Defines the computation performed at every call when testing."""
+ if self.backbone is not None:
+ features = self.backbone(img)
+ else:
+ features = kwargs['features']
+
+ if self.neck is not None:
+ features = self.neck(features)
+ predictions = self.head(features)
+ pred_pose = predictions['pred_pose']
+ pred_betas = predictions['pred_shape']
+ pred_cam = predictions['pred_cam']
+ pred_output = self.body_model_test(
+ betas=pred_betas,
+ body_pose=pred_pose[:, 1:],
+ global_orient=pred_pose[:, 0].unsqueeze(1),
+ pose2rot=False)
+
+ pred_vertices = pred_output['vertices']
+ pred_keypoints_3d = pred_output['joints']
+ all_preds = {}
+ all_preds['keypoints_3d'] = pred_keypoints_3d.detach().cpu().numpy()
+ all_preds['smpl_pose'] = pred_pose.detach().cpu().numpy()
+ all_preds['smpl_beta'] = pred_betas.detach().cpu().numpy()
+ all_preds['camera'] = pred_cam.detach().cpu().numpy()
+ all_preds['vertices'] = pred_vertices.detach().cpu().numpy()
+ image_path = []
+ for img_meta in img_metas:
+ image_path.append(img_meta['image_path'])
+ all_preds['image_path'] = image_path
+ all_preds['image_idx'] = kwargs['sample_idx']
+ return all_preds
+
+
+class VideoBodyModelEstimator(BodyModelEstimator):
+ def make_fake_data(self, predictions: dict, requires_grad: bool):
+ B, T = predictions['pred_cam'].shape[:2]
+ pred_cam_vec = predictions['pred_cam']
+ pred_betas_vec = predictions['pred_shape']
+ pred_pose = predictions['pred_pose']
+ pred_pose_vec = rotation_matrix_to_angle_axis(pred_pose.view(-1, 3, 3))
+ pred_pose_vec = pred_pose_vec.contiguous().view(B, T, -1)
+ pred_theta_vec = (pred_cam_vec, pred_pose_vec, pred_betas_vec)
+ pred_theta_vec = torch.cat(pred_theta_vec, dim=-1)
+
+ if not requires_grad:
+ pred_theta_vec = pred_theta_vec.detach()
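+        # Keep only the 69-dim body pose: the concatenated vector is
+        # [cam (3), pose (72), betas (10)], so [:, :, 6:75] drops the camera,
+        # the global orientation and the betas.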
+ return pred_theta_vec[:, :, 6:75]
+
+ def make_real_data(self, data_batch: dict):
+ B, T = data_batch['adv_smpl_transl'].shape[:2]
+ transl = data_batch['adv_smpl_transl'].view(B, T, -1)
+ global_orient = \
+ data_batch['adv_smpl_global_orient'].view(B, T, -1)
+ body_pose = data_batch['adv_smpl_body_pose'].view(B, T, -1)
+ betas = data_batch['adv_smpl_betas'].view(B, T, -1)
+ real_data = (transl, global_orient, body_pose, betas)
+ real_data = torch.cat(real_data, dim=-1).float()
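+        # Same 69-dim body pose slice as in make_fake_data:
+        # [transl (3), global_orient (3), body_pose (69), betas (10)][6:75].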
+ return real_data[:, :, 6:75]
+
+ def prepare_targets(self, data_batch: dict):
+        # Video Mesh Estimator needs to merge the first two (batch and
+        # temporal) dimensions
+ B, T = data_batch['smpl_body_pose'].shape[:2]
+
+ output = {
+ 'smpl_body_pose': data_batch['smpl_body_pose'].view(-1, 23, 3),
+ 'smpl_global_orient': data_batch['smpl_global_orient'].view(-1, 3),
+ 'smpl_betas': data_batch['smpl_betas'].view(-1, 10),
+ 'has_smpl': data_batch['has_smpl'].view(-1),
+ 'keypoints3d': data_batch['keypoints3d'].view(B * T, -1, 4),
+ 'keypoints2d': data_batch['keypoints2d'].view(B * T, -1, 3)
+ }
+ return output
+
+ def forward_test(self, img_metas: dict, **kwargs):
+ """Defines the computation performed at every call when testing."""
+ if self.backbone is not None:
+ features = self.backbone(kwargs['img'])
+ else:
+ features = kwargs['features']
+
+ if self.neck is not None:
+ features = self.neck(features)
+
+ B, T = features.shape[:2]
+ predictions = self.head(features)
+ pred_pose = predictions['pred_pose'].view(-1, 24, 3, 3)
+ pred_betas = predictions['pred_shape'].view(-1, 10)
+ pred_cam = predictions['pred_cam'].view(-1, 3)
+
+ pred_output = self.body_model_test(
+ betas=pred_betas,
+ body_pose=pred_pose[:, 1:],
+ global_orient=pred_pose[:, 0].unsqueeze(1),
+ pose2rot=False)
+
+ pred_vertices = pred_output['vertices']
+ pred_keypoints_3d = pred_output['joints']
+ all_preds = {}
+ all_preds['keypoints_3d'] = pred_keypoints_3d.detach().cpu().numpy()
+ all_preds['smpl_pose'] = pred_pose.detach().cpu().numpy()
+ all_preds['smpl_beta'] = pred_betas.detach().cpu().numpy()
+ all_preds['camera'] = pred_cam.detach().cpu().numpy()
+ all_preds['vertices'] = pred_vertices.detach().cpu().numpy()
+ all_preds['image_idx'] = \
+ kwargs['sample_idx'].detach().cpu().numpy().reshape((-1))
+ return all_preds
diff --git a/detrsmpl/models/backbones/__init__.py b/detrsmpl/models/backbones/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/models/backbones/builder.py b/detrsmpl/models/backbones/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..aced32c469d8b18001877178ef9d11e8db5d21c0
--- /dev/null
+++ b/detrsmpl/models/backbones/builder.py
@@ -0,0 +1,22 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmcv.utils import Registry
+
+from .hrnet import PoseHighResolutionNet, PoseHighResolutionNetExpose
+from .resnet import ResNet, ResNetV1d
+
+BACKBONES = Registry('backbones')
+
+BACKBONES.register_module(name='ResNet', module=ResNet)
+BACKBONES.register_module(name='ResNetV1d', module=ResNetV1d)
+BACKBONES.register_module(name='PoseHighResolutionNet',
+ module=PoseHighResolutionNet)
+BACKBONES.register_module(name='PoseHighResolutionNetExpose',
+ module=PoseHighResolutionNetExpose)
+
+
+def build_backbone(cfg):
+ """Build backbone."""
+ if cfg is None:
+ return None
+ return BACKBONES.build(cfg)
diff --git a/detrsmpl/models/backbones/hrnet.py b/detrsmpl/models/backbones/hrnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..d5f950fbc1be7399ac70453df58559cf938519c5
--- /dev/null
+++ b/detrsmpl/models/backbones/hrnet.py
@@ -0,0 +1,754 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmcv.runner import BaseModule, ModuleList, Sequential
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from .resnet import BasicBlock, Bottleneck
+
+
+class HRModule(BaseModule):
+ """High-Resolution Module for HRNet.
+
+ In this module, every branch has 4 BasicBlocks/Bottlenecks. Fusion/Exchange
+ is in this module.
+ """
+ def __init__(self,
+ num_branches,
+ blocks,
+ num_blocks,
+ in_channels,
+ num_channels,
+ multiscale_output=True,
+ with_cp=False,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN'),
+ block_init_cfg=None,
+ init_cfg=None):
+ super(HRModule, self).__init__(init_cfg)
+ self.block_init_cfg = block_init_cfg
+ self._check_branches(num_branches, num_blocks, in_channels,
+ num_channels)
+
+ self.in_channels = in_channels
+ self.num_branches = num_branches
+
+ self.multiscale_output = multiscale_output
+ self.norm_cfg = norm_cfg
+ self.conv_cfg = conv_cfg
+ self.with_cp = with_cp
+ self.branches = self._make_branches(num_branches, blocks, num_blocks,
+ num_channels)
+ self.fuse_layers = self._make_fuse_layers()
+ self.relu = nn.ReLU(inplace=False)
+
+ def _check_branches(self, num_branches, num_blocks, in_channels,
+ num_channels):
+ if num_branches != len(num_blocks):
+ error_msg = f'NUM_BRANCHES({num_branches}) ' \
+ f'!= NUM_BLOCKS({len(num_blocks)})'
+ raise ValueError(error_msg)
+
+ if num_branches != len(num_channels):
+ error_msg = f'NUM_BRANCHES({num_branches}) ' \
+ f'!= NUM_CHANNELS({len(num_channels)})'
+ raise ValueError(error_msg)
+
+ if num_branches != len(in_channels):
+ error_msg = f'NUM_BRANCHES({num_branches}) ' \
+ f'!= NUM_INCHANNELS({len(in_channels)})'
+ raise ValueError(error_msg)
+
+ def _make_one_branch(self,
+ branch_index,
+ block,
+ num_blocks,
+ num_channels,
+ stride=1):
+ downsample = None
+ if stride != 1 or \
+ self.in_channels[branch_index] != \
+ num_channels[branch_index] * block.expansion:
+ downsample = nn.Sequential(
+ build_conv_layer(self.conv_cfg,
+ self.in_channels[branch_index],
+ num_channels[branch_index] * block.expansion,
+ kernel_size=1,
+ stride=stride,
+ bias=False),
+ build_norm_layer(self.norm_cfg, num_channels[branch_index] *
+ block.expansion)[1])
+
+ layers = []
+ layers.append(
+ block(self.in_channels[branch_index],
+ num_channels[branch_index],
+ stride,
+ downsample=downsample,
+ with_cp=self.with_cp,
+ norm_cfg=self.norm_cfg,
+ conv_cfg=self.conv_cfg,
+ init_cfg=self.block_init_cfg))
+ self.in_channels[branch_index] = \
+ num_channels[branch_index] * block.expansion
+ for i in range(1, num_blocks[branch_index]):
+ layers.append(
+ block(self.in_channels[branch_index],
+ num_channels[branch_index],
+ with_cp=self.with_cp,
+ norm_cfg=self.norm_cfg,
+ conv_cfg=self.conv_cfg,
+ init_cfg=self.block_init_cfg))
+
+ return Sequential(*layers)
+
+ def _make_branches(self, num_branches, block, num_blocks, num_channels):
+ branches = []
+
+ for i in range(num_branches):
+ branches.append(
+ self._make_one_branch(i, block, num_blocks, num_channels))
+
+ return ModuleList(branches)
+
+ def _make_fuse_layers(self):
+ if self.num_branches == 1:
+ return None
+
+ num_branches = self.num_branches
+ in_channels = self.in_channels
+ fuse_layers = []
+ num_out_branches = num_branches if self.multiscale_output else 1
+ for i in range(num_out_branches):
+ fuse_layer = []
+ for j in range(num_branches):
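+                # Branch j feeds output branch i: lower-resolution branches
+                # (j > i) are 1x1-projected and upsampled, higher-resolution
+                # branches (j < i) are downsampled with strided 3x3 convs,
+                # and j == i is an identity connection.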
+ if j > i:
+ fuse_layer.append(
+ nn.Sequential(
+ build_conv_layer(self.conv_cfg,
+ in_channels[j],
+ in_channels[i],
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias=False),
+ build_norm_layer(self.norm_cfg, in_channels[i])[1],
+ nn.Upsample(scale_factor=2**(j - i),
+ mode='nearest')))
+ elif j == i:
+ fuse_layer.append(None)
+ else:
+ conv_downsamples = []
+ for k in range(i - j):
+ if k == i - j - 1:
+ conv_downsamples.append(
+ nn.Sequential(
+ build_conv_layer(self.conv_cfg,
+ in_channels[j],
+ in_channels[i],
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ bias=False),
+ build_norm_layer(self.norm_cfg,
+ in_channels[i])[1]))
+ else:
+ conv_downsamples.append(
+ nn.Sequential(
+ build_conv_layer(self.conv_cfg,
+ in_channels[j],
+ in_channels[j],
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ bias=False),
+ build_norm_layer(self.norm_cfg,
+ in_channels[j])[1],
+ nn.ReLU(inplace=False)))
+ fuse_layer.append(nn.Sequential(*conv_downsamples))
+ fuse_layers.append(nn.ModuleList(fuse_layer))
+
+ return nn.ModuleList(fuse_layers)
+
+ def forward(self, x):
+ """Forward function."""
+ if self.num_branches == 1:
+ return [self.branches[0](x[0])]
+
+ for i in range(self.num_branches):
+ x[i] = self.branches[i](x[i])
+
+ x_fuse = []
+ for i in range(len(self.fuse_layers)):
+ y = 0
+ for j in range(self.num_branches):
+ if i == j:
+ y += x[j]
+ else:
+ y += self.fuse_layers[i][j](x[j])
+ x_fuse.append(self.relu(y))
+ return x_fuse
+
+
+class PoseHighResolutionNet(BaseModule):
+ """HRNet backbone.
+    `High-Resolution Representations for Labeling Pixels and Regions
+    arXiv: <https://arxiv.org/abs/1904.04514>`_.
+ Args:
+ extra (dict): Detailed configuration for each stage of HRNet.
+ There must be 4 stages, the configuration for each stage must have
+ 5 keys:
+ - num_modules(int): The number of HRModule in this stage.
+ - num_branches(int): The number of branches in the HRModule.
+ - block(str): The type of convolution block.
+ - num_blocks(tuple): The number of blocks in each branch.
+ The length must be equal to num_branches.
+ - num_channels(tuple): The number of channels in each branch.
+ The length must be equal to num_branches.
+ in_channels (int): Number of input image channels. Default: 3.
+ conv_cfg (dict): Dictionary to construct and config conv layer.
+ norm_cfg (dict): Dictionary to construct and config norm layer.
+ norm_eval (bool): Whether to set norm layers to eval mode, namely,
+ freeze running stats (mean and var). Note: Effect on Batch Norm
+ and its variants only. Default: True.
+ with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+ memory while slowing down the training speed. Default: False.
+ zero_init_residual (bool): Whether to use zero init for last norm layer
+ in resblocks to let them behave as identity. Default: False.
+ multiscale_output (bool): Whether to output multi-level features
+ produced by multiple branches. If False, only the first level
+ feature will be output. Default: True.
+        num_joints (int): Number of output channels of the final conv
+            layer. Default: 24.
+ pretrained (str, optional): Model pretrained path. Default: None.
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Default: None.
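+
+    Example:
+        A minimal sketch of an ``extra`` config (HRNet-W32-like values,
+        illustrative rather than a tuned setting):
+
+        >>> extra = dict(
+        ...     stage1=dict(num_modules=1, num_branches=1,
+        ...                 block='BOTTLENECK', num_blocks=(4, ),
+        ...                 num_channels=(64, )),
+        ...     stage2=dict(num_modules=1, num_branches=2, block='BASIC',
+        ...                 num_blocks=(4, 4), num_channels=(32, 64)),
+        ...     stage3=dict(num_modules=4, num_branches=3, block='BASIC',
+        ...                 num_blocks=(4, 4, 4),
+        ...                 num_channels=(32, 64, 128)),
+        ...     stage4=dict(num_modules=3, num_branches=4, block='BASIC',
+        ...                 num_blocks=(4, 4, 4, 4),
+        ...                 num_channels=(32, 64, 128, 256)),
+        ...     final_conv_kernel=1, downsample=False, use_conv=False,
+        ...     return_list=False)
+        >>> self = PoseHighResolutionNet(extra=extra)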
+ """
+
+ blocks_dict = {'BASIC': BasicBlock, 'BOTTLENECK': Bottleneck}
+
+ def __init__(self,
+ extra,
+ in_channels=3,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN'),
+ norm_eval=True,
+ with_cp=False,
+ num_joints=24,
+ zero_init_residual=False,
+ multiscale_output=True,
+ pretrained=None,
+ init_cfg=None):
+ super(PoseHighResolutionNet, self).__init__(init_cfg)
+
+ self.pretrained = pretrained
+ assert not (init_cfg and pretrained), \
+ 'init_cfg and pretrained cannot be specified at the same time'
+ if isinstance(pretrained, str):
+ warnings.warn('DeprecationWarning: pretrained is deprecated, '
+ 'please use "init_cfg" instead')
+ self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+ elif pretrained is None:
+ if init_cfg is None:
+ self.init_cfg = [
+ dict(type='Kaiming', layer='Conv2d'),
+ dict(type='Constant',
+ val=1,
+ layer=['_BatchNorm', 'GroupNorm'])
+ ]
+ else:
+ raise TypeError('pretrained must be a str or None')
+
+ # Assert configurations of 4 stages are in extra
+ assert 'stage1' in extra and 'stage2' in extra \
+ and 'stage3' in extra and 'stage4' in extra
+ # Assert whether the length of `num_blocks` and `num_channels` are
+ # equal to `num_branches`
+ for i in range(4):
+ cfg = extra[f'stage{i + 1}']
+ assert len(cfg['num_blocks']) == cfg['num_branches'] and \
+ len(cfg['num_channels']) == cfg['num_branches']
+
+ self.extra = extra
+ self.conv_cfg = conv_cfg
+ self.norm_cfg = norm_cfg
+ self.norm_eval = norm_eval
+ self.with_cp = with_cp
+ self.zero_init_residual = zero_init_residual
+
+ # stem net
+ self.norm1_name, norm1 = build_norm_layer(self.norm_cfg, 64, postfix=1)
+ self.norm2_name, norm2 = build_norm_layer(self.norm_cfg, 64, postfix=2)
+
+ self.conv1 = build_conv_layer(self.conv_cfg,
+ in_channels,
+ 64,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ bias=False)
+
+ self.add_module(self.norm1_name, norm1)
+ self.conv2 = build_conv_layer(self.conv_cfg,
+ 64,
+ 64,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ bias=False)
+
+ self.add_module(self.norm2_name, norm2)
+ self.relu = nn.ReLU(inplace=True)
+
+ # stage 1
+ self.stage1_cfg = self.extra['stage1']
+ num_channels = self.stage1_cfg['num_channels'][0]
+ block_type = self.stage1_cfg['block']
+ num_blocks = self.stage1_cfg['num_blocks'][0]
+
+ block = self.blocks_dict[block_type]
+ stage1_out_channels = num_channels * block.expansion
+ self.layer1 = self._make_layer(block, 64, num_channels, num_blocks)
+
+ # stage 2
+ self.stage2_cfg = self.extra['stage2']
+ num_channels = self.stage2_cfg['num_channels']
+ block_type = self.stage2_cfg['block']
+
+ block = self.blocks_dict[block_type]
+ num_channels = [channel * block.expansion for channel in num_channels]
+ self.transition1 = self._make_transition_layer([stage1_out_channels],
+ num_channels)
+ self.stage2, pre_stage_channels = self._make_stage(
+ self.stage2_cfg, num_channels)
+
+ # stage 3
+ self.stage3_cfg = self.extra['stage3']
+ num_channels = self.stage3_cfg['num_channels']
+ block_type = self.stage3_cfg['block']
+
+ block = self.blocks_dict[block_type]
+ num_channels = [channel * block.expansion for channel in num_channels]
+ self.transition2 = self._make_transition_layer(pre_stage_channels,
+ num_channels)
+ self.stage3, pre_stage_channels = self._make_stage(
+ self.stage3_cfg, num_channels)
+
+ # stage 4
+ self.stage4_cfg = self.extra['stage4']
+ num_channels = self.stage4_cfg['num_channels']
+ block_type = self.stage4_cfg['block']
+
+ block = self.blocks_dict[block_type]
+ num_channels = [channel * block.expansion for channel in num_channels]
+ self.transition3 = self._make_transition_layer(pre_stage_channels,
+ num_channels)
+ self.stage4, pre_stage_channels = self._make_stage(
+ self.stage4_cfg, num_channels, multiscale_output=multiscale_output)
+ # self.pretrained_layers = extra['pretrained_layers']
+ self.final_layer = build_conv_layer(
+ cfg=self.conv_cfg,
+ in_channels=pre_stage_channels[0],
+ out_channels=num_joints,
+ kernel_size=extra['final_conv_kernel'],
+ stride=1,
+ padding=1 if extra['final_conv_kernel'] == 3 else 0)
+ if extra['downsample'] and extra['use_conv']:
+ self.downsample_stage_1 = self._make_downsample_layer(
+ 3, num_channel=self.stage2_cfg['num_channels'][0])
+ self.downsample_stage_2 = self._make_downsample_layer(
+ 2, num_channel=self.stage2_cfg['num_channels'][-1])
+ self.downsample_stage_3 = self._make_downsample_layer(
+ 1, num_channel=self.stage3_cfg['num_channels'][-1])
+ elif not extra['downsample'] and extra['use_conv']:
+ self.upsample_stage_2 = self._make_upsample_layer(
+ 1, num_channel=self.stage2_cfg['num_channels'][-1])
+ self.upsample_stage_3 = self._make_upsample_layer(
+ 2, num_channel=self.stage3_cfg['num_channels'][-1])
+ self.upsample_stage_4 = self._make_upsample_layer(
+ 3, num_channel=self.stage4_cfg['num_channels'][-1])
+
+ @property
+ def norm1(self):
+ """nn.Module: the normalization layer named "norm1" """
+ return getattr(self, self.norm1_name)
+
+ @property
+ def norm2(self):
+ """nn.Module: the normalization layer named "norm2" """
+ return getattr(self, self.norm2_name)
+
+ def _make_transition_layer(self, num_channels_pre_layer,
+ num_channels_cur_layer):
+ num_branches_cur = len(num_channels_cur_layer)
+ num_branches_pre = len(num_channels_pre_layer)
+
+ transition_layers = []
+ for i in range(num_branches_cur):
+ if i < num_branches_pre:
+ if num_channels_cur_layer[i] != num_channels_pre_layer[i]:
+ transition_layers.append(
+ nn.Sequential(
+ build_conv_layer(self.conv_cfg,
+ num_channels_pre_layer[i],
+ num_channels_cur_layer[i],
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False),
+ build_norm_layer(self.norm_cfg,
+ num_channels_cur_layer[i])[1],
+ nn.ReLU(inplace=True)))
+ else:
+ transition_layers.append(None)
+ else:
+ conv_downsamples = []
+ for j in range(i + 1 - num_branches_pre):
+ in_channels = num_channels_pre_layer[-1]
+ out_channels = num_channels_cur_layer[i] \
+ if j == i - num_branches_pre else in_channels
+ conv_downsamples.append(
+ nn.Sequential(
+ build_conv_layer(self.conv_cfg,
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ bias=False),
+ build_norm_layer(self.norm_cfg, out_channels)[1],
+ nn.ReLU(inplace=True)))
+ transition_layers.append(nn.Sequential(*conv_downsamples))
+
+ return nn.ModuleList(transition_layers)
+
+ def _make_layer(self, block, inplanes, planes, blocks, stride=1):
+ downsample = None
+ if stride != 1 or inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ build_conv_layer(self.conv_cfg,
+ inplanes,
+ planes * block.expansion,
+ kernel_size=1,
+ stride=stride,
+ bias=False),
+ build_norm_layer(self.norm_cfg, planes * block.expansion)[1])
+
+ layers = []
+ block_init_cfg = None
+ if self.pretrained is None and not hasattr(
+ self, 'init_cfg') and self.zero_init_residual:
+ if block is BasicBlock:
+ block_init_cfg = dict(type='Constant',
+ val=0,
+ override=dict(name='norm2'))
+ elif block is Bottleneck:
+ block_init_cfg = dict(type='Constant',
+ val=0,
+ override=dict(name='norm3'))
+ layers.append(
+ block(
+ inplanes,
+ planes,
+ stride,
+ downsample=downsample,
+ with_cp=self.with_cp,
+ norm_cfg=self.norm_cfg,
+ conv_cfg=self.conv_cfg,
+ init_cfg=block_init_cfg,
+ ))
+ inplanes = planes * block.expansion
+ for i in range(1, blocks):
+ layers.append(
+ block(inplanes,
+ planes,
+ with_cp=self.with_cp,
+ norm_cfg=self.norm_cfg,
+ conv_cfg=self.conv_cfg,
+ init_cfg=block_init_cfg))
+
+ return Sequential(*layers)
+
+ def _make_stage(self, layer_config, in_channels, multiscale_output=True):
+ num_modules = layer_config['num_modules']
+ num_branches = layer_config['num_branches']
+ num_blocks = layer_config['num_blocks']
+ num_channels = layer_config['num_channels']
+ block = self.blocks_dict[layer_config['block']]
+
+ hr_modules = []
+ block_init_cfg = None
+ if self.pretrained is None and not hasattr(
+ self, 'init_cfg') and self.zero_init_residual:
+ if block is BasicBlock:
+ block_init_cfg = dict(type='Constant',
+ val=0,
+ override=dict(name='norm2'))
+ elif block is Bottleneck:
+ block_init_cfg = dict(type='Constant',
+ val=0,
+ override=dict(name='norm3'))
+
+ for i in range(num_modules):
+ # multi_scale_output is only used for the last module
+ if not multiscale_output and i == num_modules - 1:
+ reset_multiscale_output = False
+ else:
+ reset_multiscale_output = True
+
+ hr_modules.append(
+ HRModule(num_branches,
+ block,
+ num_blocks,
+ in_channels,
+ num_channels,
+ reset_multiscale_output,
+ with_cp=self.with_cp,
+ norm_cfg=self.norm_cfg,
+ conv_cfg=self.conv_cfg,
+ block_init_cfg=block_init_cfg))
+
+ return Sequential(*hr_modules), in_channels
+
+ def _make_upsample_layer(self, num_layers, num_channel, kernel_size=3):
+ layers = []
+ for i in range(num_layers):
+ layers.append(
+ nn.Upsample(scale_factor=2,
+ mode='bilinear',
+ align_corners=True))
+ layers.append(
+ build_conv_layer(
+ cfg=self.conv_cfg,
+ in_channels=num_channel,
+ out_channels=num_channel,
+ kernel_size=kernel_size,
+ stride=1,
+ padding=1,
+ bias=False,
+ ))
+ layers.append(build_norm_layer(self.norm_cfg, num_channel)[1])
+ layers.append(nn.ReLU(inplace=True))
+
+ return nn.Sequential(*layers)
+
+ def _make_downsample_layer(self, num_layers, num_channel, kernel_size=3):
+ layers = []
+ for i in range(num_layers):
+ layers.append(
+ build_conv_layer(
+ cfg=self.conv_cfg,
+ in_channels=num_channel,
+ out_channels=num_channel,
+ kernel_size=kernel_size,
+ stride=2,
+ padding=1,
+ bias=False,
+ ))
+ layers.append(build_norm_layer(self.norm_cfg, num_channel)[1])
+ layers.append(nn.ReLU(inplace=True))
+
+ return nn.Sequential(*layers)
+
+ def forward(self, x):
+ """Forward function."""
+ x = self.conv1(x)
+ x = self.norm1(x)
+ x = self.relu(x)
+ x = self.conv2(x)
+ x = self.norm2(x)
+ x = self.relu(x)
+ x = self.layer1(x)
+
+ x_list = []
+ for i in range(self.stage2_cfg['num_branches']):
+ if self.transition1[i] is not None:
+ x_list.append(self.transition1[i](x))
+ else:
+ x_list.append(x)
+ y_list = self.stage2(x_list)
+
+ x_list = []
+ for i in range(self.stage3_cfg['num_branches']):
+ if self.transition2[i] is not None:
+ x_list.append(self.transition2[i](y_list[-1]))
+ else:
+ x_list.append(y_list[i])
+ y_list = self.stage3(x_list)
+
+ x_list = []
+ for i in range(self.stage4_cfg['num_branches']):
+ if self.transition3[i] is not None:
+ x_list.append(self.transition3[i](y_list[-1]))
+ else:
+ x_list.append(y_list[i])
+ y_list = self.stage4(x_list)
+ if self.extra['return_list']:
+ return y_list
+ elif self.extra['downsample']:
+ if self.extra['use_conv']:
+ # Downsampling with strided convolutions
+ x1 = self.downsample_stage_1(y_list[0])
+ x2 = self.downsample_stage_2(y_list[1])
+ x3 = self.downsample_stage_3(y_list[2])
+ x = torch.cat([x1, x2, x3, y_list[3]], 1)
+ else:
+ # Downsampling with interpolation
+ x0_h, x0_w = y_list[3].size(2), y_list[3].size(3)
+ x1 = F.interpolate(y_list[0],
+ size=(x0_h, x0_w),
+ mode='bilinear',
+ align_corners=True)
+ x2 = F.interpolate(y_list[1],
+ size=(x0_h, x0_w),
+ mode='bilinear',
+ align_corners=True)
+ x3 = F.interpolate(y_list[2],
+ size=(x0_h, x0_w),
+ mode='bilinear',
+ align_corners=True)
+ x = torch.cat([x1, x2, x3, y_list[3]], 1)
+ else:
+ if self.extra['use_conv']:
+ # Upsampling with interpolations + convolutions
+ x1 = self.upsample_stage_2(y_list[1])
+ x2 = self.upsample_stage_3(y_list[2])
+ x3 = self.upsample_stage_4(y_list[3])
+ x = torch.cat([y_list[0], x1, x2, x3], 1)
+ else:
+ # Upsampling with interpolation
+ x0_h, x0_w = y_list[0].size(2), y_list[0].size(3)
+ x1 = F.interpolate(y_list[1],
+ size=(x0_h, x0_w),
+ mode='bilinear',
+ align_corners=True)
+ x2 = F.interpolate(y_list[2],
+ size=(x0_h, x0_w),
+ mode='bilinear',
+ align_corners=True)
+ x3 = F.interpolate(y_list[3],
+ size=(x0_h, x0_w),
+ mode='bilinear',
+ align_corners=True)
+ x = torch.cat([y_list[0], x1, x2, x3], 1)
+ return x
+
+ def train(self, mode=True):
+ """Convert the model into training mode will keeping the normalization
+ layer freezed."""
+ super(PoseHighResolutionNet, self).train(mode)
+ if mode and self.norm_eval:
+ for m in self.modules():
+ # trick: eval have effect on BatchNorm only
+ if isinstance(m, _BatchNorm):
+ m.eval()
+
+
+class PoseHighResolutionNetExpose(PoseHighResolutionNet):
+ """HRNet backbone for expose."""
+ def __init__(self,
+ extra,
+ in_channels=3,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN'),
+ norm_eval=True,
+ with_cp=False,
+ num_joints=24,
+ zero_init_residual=False,
+ multiscale_output=True,
+ pretrained=None,
+ init_cfg=None):
+ super().__init__(extra, in_channels, conv_cfg, norm_cfg, norm_eval,
+ with_cp, num_joints, zero_init_residual,
+ multiscale_output, pretrained, init_cfg)
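+        # Channel count of the concatenated feature map: the stage-2 and
+        # stage-3 branches are subsampled by 4x and 2x respectively (each
+        # stride-2 conv doubles the channels) before being concatenated
+        # with the lowest-resolution branch.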
+ in_dims = (2**2 * self.stage2_cfg['num_channels'][-1] +
+ 2**1 * self.stage3_cfg['num_channels'][-1] +
+ self.stage4_cfg['num_channels'][-1])
+ self.conv_layers = self._make_conv_layer(in_channels=in_dims,
+ num_layers=5)
+ self.subsample_3 = self._make_subsample_layer(
+ in_channels=self.stage2_cfg['num_channels'][-1], num_layers=2)
+ self.subsample_2 = self._make_subsample_layer(
+ in_channels=self.stage3_cfg['num_channels'][-1], num_layers=1)
+
+ def _make_conv_layer(self,
+ in_channels=2048,
+ num_layers=3,
+ num_filters=2048,
+ stride=1):
+
+ layers = []
+ for i in range(num_layers):
+
+ downsample = nn.Conv2d(in_channels,
+ num_filters,
+ stride=1,
+ kernel_size=1,
+ bias=False)
+ layers.append(
+ Bottleneck(in_channels,
+ num_filters // 4,
+ downsample=downsample))
+ in_channels = num_filters
+
+ return nn.Sequential(*layers)
+
+ def _make_subsample_layer(self, in_channels=96, num_layers=3, stride=2):
+
+ layers = []
+ for i in range(num_layers):
+
+ layers.append(
+ nn.Conv2d(in_channels=in_channels,
+ out_channels=2 * in_channels,
+ kernel_size=3,
+ stride=stride,
+ padding=1))
+ in_channels = 2 * in_channels
+ layers.append(nn.BatchNorm2d(in_channels, momentum=0.1))
+ layers.append(nn.ReLU(inplace=True))
+
+ return nn.Sequential(*layers)
+
+ def forward(self, x):
+ """Forward function."""
+ x = self.conv1(x)
+ x = self.norm1(x)
+ x = self.relu(x)
+ x = self.conv2(x)
+ x = self.norm2(x)
+ x = self.relu(x)
+ x = self.layer1(x)
+
+ x_list = []
+ for i in range(self.stage2_cfg['num_branches']):
+ if self.transition1[i] is not None:
+ x_list.append(self.transition1[i](x))
+ else:
+ x_list.append(x)
+ y_list = self.stage2(x_list)
+
+ x_list = []
+ for i in range(self.stage3_cfg['num_branches']):
+ if self.transition2[i] is not None:
+ x_list.append(self.transition2[i](y_list[-1]))
+ else:
+ x_list.append(y_list[i])
+ y_list = self.stage3(x_list)
+
+ x_list = []
+ for i in range(self.stage4_cfg['num_branches']):
+ if self.transition3[i] is not None:
+ x_list.append(self.transition3[i](y_list[-1]))
+ else:
+ x_list.append(y_list[i])
+ x3 = self.subsample_3(x_list[1])
+ x2 = self.subsample_2(x_list[2])
+ x1 = x_list[3]
+ xf = self.conv_layers(torch.cat([x3, x2, x1], dim=1))
+ xf = xf.mean(dim=(2, 3))
+ xf = xf.view(xf.size(0), -1)
+ return xf
diff --git a/detrsmpl/models/backbones/resnet.py b/detrsmpl/models/backbones/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..7333646625b104a35eccf4e1405caa6fb81aaa88
--- /dev/null
+++ b/detrsmpl/models/backbones/resnet.py
@@ -0,0 +1,662 @@
+import warnings
+
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from mmcv.cnn import build_conv_layer, build_norm_layer, build_plugin_layer
+from mmcv.runner import BaseModule
+from torch.nn.modules.batchnorm import _BatchNorm
+
+from ..utils import ResLayer
+
+
+class BasicBlock(BaseModule):
+ expansion = 1
+
+ def __init__(self,
+ inplanes,
+ planes,
+ stride=1,
+ dilation=1,
+ downsample=None,
+ style='pytorch',
+ with_cp=False,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN'),
+ dcn=None,
+ plugins=None,
+ init_cfg=None):
+ super(BasicBlock, self).__init__(init_cfg)
+ assert dcn is None, 'Not implemented yet.'
+ assert plugins is None, 'Not implemented yet.'
+
+ self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1)
+ self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2)
+
+ self.conv1 = build_conv_layer(conv_cfg,
+ inplanes,
+ planes,
+ 3,
+ stride=stride,
+ padding=dilation,
+ dilation=dilation,
+ bias=False)
+ self.add_module(self.norm1_name, norm1)
+ self.conv2 = build_conv_layer(conv_cfg,
+ planes,
+ planes,
+ 3,
+ padding=1,
+ bias=False)
+ self.add_module(self.norm2_name, norm2)
+
+ self.relu = nn.ReLU(inplace=True)
+ self.downsample = downsample
+ self.stride = stride
+ self.dilation = dilation
+ self.with_cp = with_cp
+
+ @property
+ def norm1(self):
+ """nn.Module: normalization layer after the first convolution layer"""
+ return getattr(self, self.norm1_name)
+
+ @property
+ def norm2(self):
+ """nn.Module: normalization layer after the second convolution layer"""
+ return getattr(self, self.norm2_name)
+
+ def forward(self, x):
+ """Forward function."""
+ def _inner_forward(x):
+ identity = x
+
+ out = self.conv1(x)
+ out = self.norm1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.norm2(out)
+
+ if self.downsample is not None:
+ identity = self.downsample(x)
+
+ out += identity
+
+ return out
+
+ if self.with_cp and x.requires_grad:
+ out = cp.checkpoint(_inner_forward, x)
+ else:
+ out = _inner_forward(x)
+
+ out = self.relu(out)
+
+ return out
+
+
+class Bottleneck(BaseModule):
+ expansion = 4
+
+ def __init__(self,
+ inplanes,
+ planes,
+ stride=1,
+ dilation=1,
+ downsample=None,
+ style='pytorch',
+ with_cp=False,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN'),
+ dcn=None,
+ plugins=None,
+ init_cfg=None):
+ """Bottleneck block for ResNet.
+
+ If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
+ it is "caffe", the stride-two layer is the first 1x1 conv layer.
+ """
+ super(Bottleneck, self).__init__(init_cfg)
+ assert style in ['pytorch', 'caffe']
+ assert dcn is None or isinstance(dcn, dict)
+ assert plugins is None or isinstance(plugins, list)
+ if plugins is not None:
+ allowed_position = ['after_conv1', 'after_conv2', 'after_conv3']
+ assert all(p['position'] in allowed_position for p in plugins)
+
+ self.inplanes = inplanes
+ self.planes = planes
+ self.stride = stride
+ self.dilation = dilation
+ self.style = style
+ self.with_cp = with_cp
+ self.conv_cfg = conv_cfg
+ self.norm_cfg = norm_cfg
+ self.dcn = dcn
+ self.with_dcn = dcn is not None
+ self.plugins = plugins
+ self.with_plugins = plugins is not None
+
+ if self.with_plugins:
+ # collect plugins for conv1/conv2/conv3
+ self.after_conv1_plugins = [
+ plugin['cfg'] for plugin in plugins
+ if plugin['position'] == 'after_conv1'
+ ]
+ self.after_conv2_plugins = [
+ plugin['cfg'] for plugin in plugins
+ if plugin['position'] == 'after_conv2'
+ ]
+ self.after_conv3_plugins = [
+ plugin['cfg'] for plugin in plugins
+ if plugin['position'] == 'after_conv3'
+ ]
+
+ if self.style == 'pytorch':
+ self.conv1_stride = 1
+ self.conv2_stride = stride
+ else:
+ self.conv1_stride = stride
+ self.conv2_stride = 1
+
+ self.norm1_name, norm1 = build_norm_layer(norm_cfg, planes, postfix=1)
+ self.norm2_name, norm2 = build_norm_layer(norm_cfg, planes, postfix=2)
+ self.norm3_name, norm3 = build_norm_layer(norm_cfg,
+ planes * self.expansion,
+ postfix=3)
+
+ self.conv1 = build_conv_layer(conv_cfg,
+ inplanes,
+ planes,
+ kernel_size=1,
+ stride=self.conv1_stride,
+ bias=False)
+ self.add_module(self.norm1_name, norm1)
+ fallback_on_stride = False
+ if self.with_dcn:
+ fallback_on_stride = dcn.pop('fallback_on_stride', False)
+ if not self.with_dcn or fallback_on_stride:
+ self.conv2 = build_conv_layer(conv_cfg,
+ planes,
+ planes,
+ kernel_size=3,
+ stride=self.conv2_stride,
+ padding=dilation,
+ dilation=dilation,
+ bias=False)
+ else:
+ assert self.conv_cfg is None, 'conv_cfg must be None for DCN'
+ self.conv2 = build_conv_layer(dcn,
+ planes,
+ planes,
+ kernel_size=3,
+ stride=self.conv2_stride,
+ padding=dilation,
+ dilation=dilation,
+ bias=False)
+
+ self.add_module(self.norm2_name, norm2)
+ self.conv3 = build_conv_layer(conv_cfg,
+ planes,
+ planes * self.expansion,
+ kernel_size=1,
+ bias=False)
+ self.add_module(self.norm3_name, norm3)
+
+ self.relu = nn.ReLU(inplace=True)
+ self.downsample = downsample
+
+ if self.with_plugins:
+ self.after_conv1_plugin_names = self.make_block_plugins(
+ planes, self.after_conv1_plugins)
+ self.after_conv2_plugin_names = self.make_block_plugins(
+ planes, self.after_conv2_plugins)
+ self.after_conv3_plugin_names = self.make_block_plugins(
+ planes * self.expansion, self.after_conv3_plugins)
+
+ def make_block_plugins(self, in_channels, plugins):
+ """make plugins for block.
+
+ Args:
+ in_channels (int): Input channels of plugin.
+ plugins (list[dict]): List of plugins cfg to build.
+ Returns:
+ list[str]: List of the names of plugin.
+ """
+ assert isinstance(plugins, list)
+ plugin_names = []
+ for plugin in plugins:
+ plugin = plugin.copy()
+ name, layer = build_plugin_layer(plugin,
+ in_channels=in_channels,
+ postfix=plugin.pop('postfix', ''))
+ assert not hasattr(self, name), f'duplicate plugin {name}'
+ self.add_module(name, layer)
+ plugin_names.append(name)
+ return plugin_names
+
+ def forward_plugin(self, x, plugin_names):
+ out = x
+ for name in plugin_names:
+ out = getattr(self, name)(x)
+ return out
+
+ @property
+ def norm1(self):
+ """nn.Module: normalization layer after the first convolution layer"""
+ return getattr(self, self.norm1_name)
+
+ @property
+ def norm2(self):
+ """nn.Module: normalization layer after the second convolution layer"""
+ return getattr(self, self.norm2_name)
+
+ @property
+ def norm3(self):
+ """nn.Module: normalization layer after the third convolution layer"""
+ return getattr(self, self.norm3_name)
+
+ def forward(self, x):
+ """Forward function."""
+ def _inner_forward(x):
+ identity = x
+ out = self.conv1(x)
+ out = self.norm1(out)
+ out = self.relu(out)
+
+ if self.with_plugins:
+ out = self.forward_plugin(out, self.after_conv1_plugin_names)
+
+ out = self.conv2(out)
+ out = self.norm2(out)
+ out = self.relu(out)
+
+ if self.with_plugins:
+ out = self.forward_plugin(out, self.after_conv2_plugin_names)
+
+ out = self.conv3(out)
+ out = self.norm3(out)
+
+ if self.with_plugins:
+ out = self.forward_plugin(out, self.after_conv3_plugin_names)
+
+ if self.downsample is not None:
+ identity = self.downsample(x)
+
+ out += identity
+
+ return out
+
+ if self.with_cp and x.requires_grad:
+ out = cp.checkpoint(_inner_forward, x)
+ else:
+ out = _inner_forward(x)
+
+ out = self.relu(out)
+
+ return out
+
+
+class ResNet(BaseModule):
+ """ResNet backbone.
+ Args:
+ depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
+ stem_channels (int | None): Number of stem channels. If not specified,
+ it will be the same as `base_channels`. Default: None.
+ base_channels (int): Number of base channels of res layer. Default: 64.
+ in_channels (int): Number of input image channels. Default: 3.
+ num_stages (int): Resnet stages. Default: 4.
+ strides (Sequence[int]): Strides of the first block of each stage.
+ dilations (Sequence[int]): Dilation of each stage.
+ out_indices (Sequence[int]): Output from which stages.
+ style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+ layer is the 3x3 conv layer, otherwise the stride-two layer is
+ the first 1x1 conv layer.
+        deep_stem (bool): Replace the 7x7 conv in the input stem with
+            three 3x3 convs.
+ avg_down (bool): Use AvgPool instead of stride conv when
+ downsampling in the bottleneck.
+ frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
+ -1 means not freezing any parameters.
+ norm_cfg (dict): Dictionary to construct and config norm layer.
+ norm_eval (bool): Whether to set norm layers to eval mode, namely,
+ freeze running stats (mean and var). Note: Effect on Batch Norm
+ and its variants only.
+ plugins (list[dict]): List of plugins for stages, each dict contains:
+ - cfg (dict, required): Cfg dict to build plugin.
+ - position (str, required): Position inside block to insert
+ plugin, options are 'after_conv1', 'after_conv2', 'after_conv3'.
+ - stages (tuple[bool], optional): Stages to apply plugin, length
+ should be same as 'num_stages'.
+ with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+ memory while slowing down the training speed.
+ zero_init_residual (bool): Whether to use zero init for last norm layer
+ in resblocks to let them behave as identity.
+ pretrained (str, optional): model pretrained path. Default: None
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Default: None
+ Example:
+ >>> from detrsmpl.models.backbones.resnet import ResNet
+ >>> import torch
+ >>> self = ResNet(depth=18)
+ >>> self.eval()
+ >>> inputs = torch.rand(1, 3, 32, 32)
+ >>> level_outputs = self.forward(inputs)
+ >>> for level_out in level_outputs:
+ ... print(tuple(level_out.shape))
+ (1, 64, 8, 8)
+ (1, 128, 4, 4)
+ (1, 256, 2, 2)
+ (1, 512, 1, 1)
+ """
+
+ arch_settings = {
+ 18: (BasicBlock, (2, 2, 2, 2)),
+ 34: (BasicBlock, (3, 4, 6, 3)),
+ 50: (Bottleneck, (3, 4, 6, 3)),
+ 101: (Bottleneck, (3, 4, 23, 3)),
+ 152: (Bottleneck, (3, 8, 36, 3))
+ }
+
+ def __init__(self,
+ depth,
+ in_channels=3,
+ stem_channels=None,
+ base_channels=64,
+ num_stages=4,
+ strides=(1, 2, 2, 2),
+ dilations=(1, 1, 1, 1),
+ out_indices=(0, 1, 2, 3),
+ style='pytorch',
+ deep_stem=False,
+ avg_down=False,
+ frozen_stages=-1,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN', requires_grad=True),
+ norm_eval=True,
+ dcn=None,
+ stage_with_dcn=(False, False, False, False),
+ plugins=None,
+ with_cp=False,
+ zero_init_residual=True,
+ pretrained=None,
+ init_cfg=None):
+ super(ResNet, self).__init__(init_cfg)
+ self.zero_init_residual = zero_init_residual
+ if depth not in self.arch_settings:
+ raise KeyError(f'invalid depth {depth} for resnet')
+
+ block_init_cfg = None
+ assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be specified at the same time'
+ if isinstance(pretrained, str):
+ warnings.warn('DeprecationWarning: pretrained is deprecated, '
+ 'please use "init_cfg" instead')
+ self.init_cfg = dict(type='Pretrained', checkpoint=pretrained)
+ elif pretrained is None:
+ if init_cfg is None:
+ self.init_cfg = [
+ dict(type='Kaiming', layer='Conv2d'),
+ dict(type='Constant',
+ val=1,
+ layer=['_BatchNorm', 'GroupNorm'])
+ ]
+ block = self.arch_settings[depth][0]
+ if self.zero_init_residual:
+ if block is BasicBlock:
+ block_init_cfg = dict(type='Constant',
+ val=0,
+ override=dict(name='norm2'))
+ elif block is Bottleneck:
+ block_init_cfg = dict(type='Constant',
+ val=0,
+ override=dict(name='norm3'))
+ else:
+ raise TypeError('pretrained must be a str or None')
+
+ self.depth = depth
+ if stem_channels is None:
+ stem_channels = base_channels
+ self.stem_channels = stem_channels
+ self.base_channels = base_channels
+ self.num_stages = num_stages
+ assert num_stages >= 1 and num_stages <= 4
+ self.strides = strides
+ self.dilations = dilations
+ assert len(strides) == len(dilations) == num_stages
+ self.out_indices = out_indices
+ assert max(out_indices) < num_stages
+ self.style = style
+ self.deep_stem = deep_stem
+ self.avg_down = avg_down
+ self.frozen_stages = frozen_stages
+ self.conv_cfg = conv_cfg
+ self.norm_cfg = norm_cfg
+ self.with_cp = with_cp
+ self.norm_eval = norm_eval
+ self.dcn = dcn
+ self.stage_with_dcn = stage_with_dcn
+ if dcn is not None:
+ assert len(stage_with_dcn) == num_stages
+ self.plugins = plugins
+ self.block, stage_blocks = self.arch_settings[depth]
+ self.stage_blocks = stage_blocks[:num_stages]
+ self.inplanes = stem_channels
+
+ self._make_stem_layer(in_channels, stem_channels)
+
+ self.res_layers = []
+ for i, num_blocks in enumerate(self.stage_blocks):
+ stride = strides[i]
+ dilation = dilations[i]
+ dcn = self.dcn if self.stage_with_dcn[i] else None
+ if plugins is not None:
+ stage_plugins = self.make_stage_plugins(plugins, i)
+ else:
+ stage_plugins = None
+ planes = base_channels * 2**i
+ res_layer = self.make_res_layer(block=self.block,
+ inplanes=self.inplanes,
+ planes=planes,
+ num_blocks=num_blocks,
+ stride=stride,
+ dilation=dilation,
+ style=self.style,
+ avg_down=self.avg_down,
+ with_cp=with_cp,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ dcn=dcn,
+ plugins=stage_plugins,
+ init_cfg=block_init_cfg)
+ self.inplanes = planes * self.block.expansion
+ layer_name = f'layer{i + 1}'
+ self.add_module(layer_name, res_layer)
+ self.res_layers.append(layer_name)
+
+ self._freeze_stages()
+
+ self.feat_dim = self.block.expansion * base_channels * 2**(
+ len(self.stage_blocks) - 1)
+
+ def make_stage_plugins(self, plugins, stage_idx):
+        """Make plugins for the ResNet ``stage_idx``-th stage.
+        Currently we support inserting ``context_block``,
+        ``empirical_attention_block`` and ``nonlocal_block`` into backbones
+        like ResNet/ResNeXt. They can be inserted after conv1/conv2/conv3 of
+        a Bottleneck block.
+ An example of plugins format could be:
+ Examples:
+ >>> plugins=[
+ ... dict(cfg=dict(type='xxx', arg1='xxx'),
+ ... stages=(False, True, True, True),
+ ... position='after_conv2'),
+ ... dict(cfg=dict(type='yyy'),
+ ... stages=(True, True, True, True),
+ ... position='after_conv3'),
+ ... dict(cfg=dict(type='zzz', postfix='1'),
+ ... stages=(True, True, True, True),
+ ... position='after_conv3'),
+ ... dict(cfg=dict(type='zzz', postfix='2'),
+ ... stages=(True, True, True, True),
+ ... position='after_conv3')
+ ... ]
+ >>> self = ResNet(depth=18)
+ >>> stage_plugins = self.make_stage_plugins(plugins, 0)
+ >>> assert len(stage_plugins) == 3
+ Suppose ``stage_idx=0``, the structure of blocks in the stage would be:
+ .. code-block:: none
+ conv1-> conv2->conv3->yyy->zzz1->zzz2
+        Suppose ``stage_idx=1``, the structure of blocks in the stage would be:
+ .. code-block:: none
+ conv1-> conv2->xxx->conv3->yyy->zzz1->zzz2
+ If stages is missing, the plugin would be applied to all stages.
+ Args:
+            plugins (list[dict]): List of plugin cfgs to build. The postfix is
+                required if multiple plugins of the same type are inserted.
+ stage_idx (int): Index of stage to build
+ Returns:
+ list[dict]: Plugins for current stage
+ """
+ stage_plugins = []
+ for plugin in plugins:
+ plugin = plugin.copy()
+ stages = plugin.pop('stages', None)
+ assert stages is None or len(stages) == self.num_stages
+ # whether to insert plugin into current stage
+ if stages is None or stages[stage_idx]:
+ stage_plugins.append(plugin)
+
+ return stage_plugins
+
+ def make_res_layer(self, **kwargs):
+ """Pack all blocks in a stage into a ``ResLayer``."""
+ return ResLayer(**kwargs)
+
+ @property
+ def norm1(self):
+ """nn.Module: the normalization layer named "norm1" """
+ return getattr(self, self.norm1_name)
+
+ def _make_stem_layer(self, in_channels, stem_channels):
+ if self.deep_stem:
+ self.stem = nn.Sequential(
+ build_conv_layer(self.conv_cfg,
+ in_channels,
+ stem_channels // 2,
+ kernel_size=3,
+ stride=2,
+ padding=1,
+ bias=False),
+ build_norm_layer(self.norm_cfg, stem_channels // 2)[1],
+ nn.ReLU(inplace=True),
+ build_conv_layer(self.conv_cfg,
+ stem_channels // 2,
+ stem_channels // 2,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False),
+ build_norm_layer(self.norm_cfg, stem_channels // 2)[1],
+ nn.ReLU(inplace=True),
+ build_conv_layer(self.conv_cfg,
+ stem_channels // 2,
+ stem_channels,
+ kernel_size=3,
+ stride=1,
+ padding=1,
+ bias=False),
+ build_norm_layer(self.norm_cfg, stem_channels)[1],
+ nn.ReLU(inplace=True))
+ else:
+ self.conv1 = build_conv_layer(self.conv_cfg,
+ in_channels,
+ stem_channels,
+ kernel_size=7,
+ stride=2,
+ padding=3,
+ bias=False)
+ self.norm1_name, norm1 = build_norm_layer(self.norm_cfg,
+ stem_channels,
+ postfix=1)
+ self.add_module(self.norm1_name, norm1)
+ self.relu = nn.ReLU(inplace=True)
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
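+        # Descriptive note: either stem halves the spatial resolution once,
+        # and the 3x3 max-pool above halves it again, so features entering
+        # layer1 have an overall stride of 4 with respect to the input image.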
+
+ def _freeze_stages(self):
+ if self.frozen_stages >= 0:
+ if self.deep_stem:
+ self.stem.eval()
+ for param in self.stem.parameters():
+ param.requires_grad = False
+ else:
+ self.norm1.eval()
+ for m in [self.conv1, self.norm1]:
+ for param in m.parameters():
+ param.requires_grad = False
+
+ for i in range(1, self.frozen_stages + 1):
+ m = getattr(self, f'layer{i}')
+ m.eval()
+ for param in m.parameters():
+ param.requires_grad = False
+
+ def forward(self, x):
+ """Forward function."""
+ if self.deep_stem:
+ x = self.stem(x)
+ else:
+ x = self.conv1(x)
+ x = self.norm1(x)
+ x = self.relu(x)
+ x = self.maxpool(x)
+ outs = []
+ for i, layer_name in enumerate(self.res_layers):
+ res_layer = getattr(self, layer_name)
+ x = res_layer(x)
+ if i in self.out_indices:
+ outs.append(x)
+ return tuple(outs)
+
+ def train(self, mode=True):
+        """Switch the model to training mode while keeping the normalization
+        layers frozen."""
+ super(ResNet, self).train(mode)
+ self._freeze_stages()
+ if mode and self.norm_eval:
+ for m in self.modules():
+                # trick: eval() only has an effect on BatchNorm layers
+ if isinstance(m, _BatchNorm):
+ m.eval()
+
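+
+# Usage sketch (documentation only, never called by the library): builds the
+# backbone above and runs a dummy image through it. The depth and input size
+# are illustrative assumptions, not values taken from this repository's
+# configs.
+def _resnet_usage_example():
+    import torch
+    backbone = ResNet(depth=50, out_indices=(0, 1, 2, 3))
+    backbone.eval()
+    with torch.no_grad():
+        feats = backbone(torch.rand(1, 3, 224, 224))
+    # Four feature maps with strides 4/8/16/32 and 256/512/1024/2048 channels.
+    return [f.shape for f in feats]
+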
+
+class ResNetV1d(ResNet):
+    r"""ResNetV1d variant described in `Bag of Tricks
+    <https://arxiv.org/abs/1812.01187>`_.
+    Compared with the default ResNet (ResNetV1b), ResNetV1d replaces the 7x7
+    conv in the input stem with three 3x3 convs, and in the downsampling block
+    a 2x2 avg_pool with stride 2 is added before the conv, whose stride is
+    changed to 1.
+ """
+ def __init__(self, **kwargs):
+ super(ResNetV1d, self).__init__(deep_stem=True,
+ avg_down=True,
+ **kwargs)
+
+
+def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
+ """3x3 convolution with padding."""
+ return nn.Conv2d(in_planes,
+ out_planes,
+ kernel_size=3,
+ stride=stride,
+ padding=dilation,
+ groups=groups,
+ bias=False,
+ dilation=dilation)
+
+
+def conv1x1(in_planes, out_planes, stride=1):
+ """1x1 convolution."""
+ return nn.Conv2d(in_planes,
+ out_planes,
+ kernel_size=1,
+ stride=stride,
+ bias=False)
diff --git a/detrsmpl/models/body_models/__init__.py b/detrsmpl/models/body_models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/models/body_models/builder.py b/detrsmpl/models/body_models/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..4afc217da15ca64b7c4948c804a6c0f8f5dbbad3
--- /dev/null
+++ b/detrsmpl/models/body_models/builder.py
@@ -0,0 +1,33 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmcv.utils import Registry
+
+from .flame import FLAME, FLAMELayer
+from .mano import MANO, MANOLayer
+from .smpl import SMPL, GenderedSMPL, HybrIKSMPL
+from .smplx import SMPLX, SMPLXLayer
+from .star import STAR
+
+BODY_MODELS = Registry('body_models')
+
+BODY_MODELS.register_module(name=['SMPL', 'smpl'], module=SMPL)
+BODY_MODELS.register_module(name='GenderedSMPL', module=GenderedSMPL)
+BODY_MODELS.register_module(name=['STAR', 'star'], module=STAR)
+BODY_MODELS.register_module(
+ name=['HybrIKSMPL', 'HybrIKsmpl', 'hybriksmpl', 'hybrik', 'hybrIK'],
+ module=HybrIKSMPL)
+BODY_MODELS.register_module(name=['SMPLX', 'smplx'], module=SMPLX)
+BODY_MODELS.register_module(name=['flame', 'FLAME'], module=FLAME)
+BODY_MODELS.register_module(name=['MANO', 'mano'], module=MANO)
+BODY_MODELS.register_module(name=['SMPLXLayer', 'smplxlayer'],
+ module=SMPLXLayer)
+BODY_MODELS.register_module(name=['MANOLayer', 'manolayer'], module=MANOLayer)
+BODY_MODELS.register_module(name=['FLAMELayer', 'flamelayer'],
+ module=FLAMELayer)
+
+
+def build_body_model(cfg):
+ """Build body_models."""
+ if cfg is None:
+ return None
+ return BODY_MODELS.build(cfg)
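+
+
+# Usage sketch (documentation only): the registry maps the aliases registered
+# above to their classes, so a plain config dict is enough to build a body
+# model. The model_path below is a hypothetical local path, not a file shipped
+# with this repository.
+def _example_build_smpl():
+    cfg = dict(type='SMPL',
+               model_path='data/body_models/smpl',
+               keypoint_src='smpl_45',
+               keypoint_dst='human_data')
+    return build_body_model(cfg)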
diff --git a/detrsmpl/models/body_models/flame.py b/detrsmpl/models/body_models/flame.py
new file mode 100644
index 0000000000000000000000000000000000000000..323e1a030985db109916490dad9400f911fa03fa
--- /dev/null
+++ b/detrsmpl/models/body_models/flame.py
@@ -0,0 +1,187 @@
+import numpy as np
+import torch
+from smplx import FLAME as _FLAME
+from smplx import FLAMELayer as _FLAMELayer
+
+from detrsmpl.core.conventions.keypoints_mapping import (
+ convert_kps,
+ get_keypoint_num,
+)
+
+
+class FLAME(_FLAME):
+ """Extension of the official FLAME implementation."""
+ head_pose_keys = {'global_orient', 'jaw_pose'}
+ full_pose_keys = {
+ 'global_orient', 'neck_pose', 'jaw_pose', 'leye_pose', 'reye_pose'
+ }
+
+ NUM_VERTS = 5023
+ NUM_FACES = 9976
+
+ def __init__(self,
+ *args,
+ keypoint_src: str = 'flame',
+ keypoint_dst: str = 'human_data',
+ keypoint_approximate: bool = False,
+ **kwargs):
+ """
+ Args:
+ *args: extra arguments for FLAME initialization.
+ keypoint_src: source convention of keypoints. This convention
+ is used for keypoints obtained from joint regressors.
+ Keypoints then undergo conversion into keypoint_dst
+ convention.
+ keypoint_dst: destination convention of keypoints. This convention
+ is used for keypoints in the output.
+ keypoint_approximate: whether to use approximate matching in
+ convention conversion for keypoints.
+ **kwargs: extra keyword arguments for FLAME initialization.
+
+ Returns:
+ None
+ """
+ super(FLAME, self).__init__(*args, **kwargs)
+ self.keypoint_src = keypoint_src
+ self.keypoint_dst = keypoint_dst
+ self.keypoint_approximate = keypoint_approximate
+
+ self.num_verts = self.get_num_verts()
+ self.num_faces = self.get_num_faces()
+ self.num_joints = get_keypoint_num(convention=self.keypoint_dst)
+
+ def forward(self,
+ *args,
+ return_verts: bool = True,
+ return_full_pose: bool = False,
+ **kwargs) -> dict:
+ """Forward function.
+
+ Args:
+ *args: extra arguments for FLAME
+ return_verts: whether to return vertices
+ return_full_pose: whether to return full pose parameters
+ **kwargs: extra arguments for FLAME
+
+ Returns:
+ output: contains output parameters and attributes
+ """
+ flame_output = super(FLAME, self).forward(*args, **kwargs)
+ joints = flame_output.joints
+ joints, joint_mask = convert_kps(joints,
+ src=self.keypoint_src,
+ dst=self.keypoint_dst,
+ approximate=self.keypoint_approximate)
+ if isinstance(joint_mask, np.ndarray):
+ joint_mask = torch.tensor(joint_mask,
+ dtype=torch.uint8,
+ device=joints.device)
+
+ batch_size = joints.shape[0]
+ joint_mask = joint_mask.reshape(1, -1).expand(batch_size, -1)
+
+ output = dict(global_orient=flame_output.global_orient,
+ neck_pose=flame_output.neck_pose,
+ jaw_pose=flame_output.jaw_pose,
+ joints=joints,
+ joint_mask=joint_mask,
+ keypoints=torch.cat([joints, joint_mask[:, :, None]],
+ dim=-1),
+ betas=flame_output.betas,
+ expression=flame_output.expression)
+
+ if return_verts:
+ output['vertices'] = flame_output.vertices
+ if return_full_pose:
+ output['full_pose'] = flame_output.full_pose
+
+ return output
+
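+# Note on the dict returned by FLAME.forward (the same layout is used by the
+# other body models in this package): 'joints' are already converted to the
+# keypoint_dst convention, 'joint_mask' marks which of those joints the source
+# convention actually provides, and 'keypoints' concatenates the two along the
+# last axis as (x, y, z, conf).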
+
+class FLAMELayer(_FLAMELayer):
+ """Extension of the official FLAME implementation."""
+ head_pose_keys = {'global_orient', 'jaw_pose'}
+ full_pose_keys = {
+ 'global_orient', 'neck_pose', 'jaw_pose', 'leye_pose', 'reye_pose'
+ }
+
+ NUM_VERTS = 5023
+ NUM_FACES = 9976
+
+ def __init__(self,
+ *args,
+ keypoint_src: str = 'flame',
+ keypoint_dst: str = 'human_data',
+ keypoint_approximate: bool = False,
+ **kwargs):
+ """
+ Args:
+ *args: extra arguments for FLAME initialization.
+ keypoint_src: source convention of keypoints. This convention
+ is used for keypoints obtained from joint regressors.
+ Keypoints then undergo conversion into keypoint_dst
+ convention.
+ keypoint_dst: destination convention of keypoints. This convention
+ is used for keypoints in the output.
+ keypoint_approximate: whether to use approximate matching in
+ convention conversion for keypoints.
+ **kwargs: extra keyword arguments for FLAME initialization.
+
+ Returns:
+ None
+ """
+ super(FLAMELayer, self).__init__(*args, **kwargs)
+ self.keypoint_src = keypoint_src
+ self.keypoint_dst = keypoint_dst
+ self.keypoint_approximate = keypoint_approximate
+
+ self.num_verts = self.get_num_verts()
+ self.num_faces = self.get_num_faces()
+ self.num_joints = get_keypoint_num(convention=self.keypoint_dst)
+
+ def forward(self,
+ *args,
+ return_verts: bool = True,
+ return_full_pose: bool = False,
+ **kwargs) -> dict:
+ """Forward function.
+
+ Args:
+ *args: extra arguments for FLAME
+ return_verts: whether to return vertices
+ return_full_pose: whether to return full pose parameters
+ **kwargs: extra arguments for FLAME
+
+ Returns:
+ output: contains output parameters and attributes
+ """
+ flame_output = super(FLAMELayer, self).forward(*args, **kwargs)
+ joints = flame_output.joints
+ joints, joint_mask = convert_kps(joints,
+ src=self.keypoint_src,
+ dst=self.keypoint_dst,
+ approximate=self.keypoint_approximate)
+ if isinstance(joint_mask, np.ndarray):
+ joint_mask = torch.tensor(joint_mask,
+ dtype=torch.uint8,
+ device=joints.device)
+
+ batch_size = joints.shape[0]
+ joint_mask = joint_mask.reshape(1, -1).expand(batch_size, -1)
+
+ output = dict(global_orient=flame_output.global_orient,
+ neck_pose=flame_output.neck_pose,
+ jaw_pose=flame_output.jaw_pose,
+ joints=joints,
+ joint_mask=joint_mask,
+ keypoints=torch.cat([joints, joint_mask[:, :, None]],
+ dim=-1),
+ betas=flame_output.betas,
+ expression=flame_output.expression)
+
+ if return_verts:
+ output['vertices'] = flame_output.vertices
+ if return_full_pose:
+ output['full_pose'] = flame_output.full_pose
+
+ return output
diff --git a/detrsmpl/models/body_models/mano.py b/detrsmpl/models/body_models/mano.py
new file mode 100644
index 0000000000000000000000000000000000000000..124d95d051dcaffa51fe1e34fd737a0c22d658bf
--- /dev/null
+++ b/detrsmpl/models/body_models/mano.py
@@ -0,0 +1,271 @@
+import numpy as np
+import torch
+from smplx import MANO as _MANO
+from smplx import MANOLayer as _MANOLayer
+
+from detrsmpl.core.conventions.keypoints_mapping import (
+ convert_kps,
+ get_keypoint_num,
+)
+
+
+class MANO(_MANO):
+ """Extension of the official MANO implementation."""
+ full_pose_keys = {'global_orient', 'hand_pose'}
+
+ NUM_VERTS = 776
+ NUM_FACES = 9976
+
+ KpId2manokps = {
+ 0: 0, # Wrist
+ 1: 5,
+ 2: 6,
+ 3: 7, # Index
+ 4: 9,
+ 5: 10,
+ 6: 11, # Middle
+ 7: 17,
+ 8: 18,
+ 9: 19, # Pinky
+ 10: 13,
+ 11: 14,
+ 12: 15, # Ring
+ 13: 1,
+ 14: 2,
+ 15: 3
+ } # Thumb
+ kpId2vertices = {
+ 4: 744, # Thumb
+ 8: 320, # Index
+ 12: 443, # Middle
+ 16: 555, # Ring
+        20: 672  # Pinky
+ }
+
+ def __init__(self,
+ *args,
+ keypoint_src: str = 'mano',
+ keypoint_dst: str = 'human_data',
+ keypoint_approximate: bool = False,
+ **kwargs):
+ """
+ Args:
+ *args: extra arguments for MANO initialization.
+ keypoint_src: source convention of keypoints. This convention
+ is used for keypoints obtained from joint regressors.
+ Keypoints then undergo conversion into keypoint_dst
+ convention.
+ keypoint_dst: destination convention of keypoints. This convention
+ is used for keypoints in the output.
+ keypoint_approximate: whether to use approximate matching in
+ convention conversion for keypoints.
+ **kwargs: extra keyword arguments for MANO initialization.
+
+ Returns:
+ None
+ """
+ super(MANO, self).__init__(*args, **kwargs)
+ self.keypoint_src = keypoint_src
+ self.keypoint_dst = keypoint_dst
+ self.keypoint_approximate = keypoint_approximate
+
+ self.num_verts = self.get_num_verts()
+ self.num_faces = self.get_num_faces()
+ self.num_joints = get_keypoint_num(convention=self.keypoint_dst)
+
+ def forward(self,
+ *args,
+ return_verts: bool = True,
+ return_full_pose: bool = False,
+ **kwargs) -> dict:
+ """Forward function.
+
+ Args:
+ *args: extra arguments for MANO
+ return_verts: whether to return vertices
+ return_full_pose: whether to return full pose parameters
+ **kwargs: extra arguments for MANO
+
+ Returns:
+ output: contains output parameters and attributes
+ """
+ if 'right_hand_pose' in kwargs:
+ kwargs['hand_pose'] = kwargs['right_hand_pose']
+ mano_output = super(MANO, self).forward(*args, **kwargs)
+ joints = mano_output.joints
+
+ joints = self.get_keypoints_from_mesh(mano_output.vertices, joints)
+
+ joints, joint_mask = convert_kps(joints,
+ src=self.keypoint_src,
+ dst=self.keypoint_dst,
+ approximate=self.keypoint_approximate)
+ if isinstance(joint_mask, np.ndarray):
+ joint_mask = torch.tensor(joint_mask,
+ dtype=torch.uint8,
+ device=joints.device)
+
+ batch_size = joints.shape[0]
+ joint_mask = joint_mask.reshape(1, -1).expand(batch_size, -1)
+
+ output = dict(
+ global_orient=mano_output.global_orient,
+ hand_pose=mano_output.hand_pose,
+ joints=joints,
+ joint_mask=joint_mask,
+ keypoints=torch.cat([joints, joint_mask[:, :, None]], dim=-1),
+ betas=mano_output.betas,
+ )
+
+ if return_verts:
+ output['vertices'] = mano_output.vertices
+ if return_full_pose:
+ output['full_pose'] = mano_output.full_pose
+
+ return output
+
+ def get_keypoints_from_mesh(self, mesh_vertices, keypoints_regressed):
+        """Assemble the full 21-keypoint set from the 16 MANO keypoints and 5
+        mesh vertices for the fingertips."""
+ batch_size = keypoints_regressed.shape[0]
+        keypoints = keypoints_regressed.new_zeros((batch_size, 21, 3))
+
+ # fill keypoints which are regressed
+ for manoId, myId in self.KpId2manokps.items():
+ keypoints[:, myId, :] = keypoints_regressed[:, manoId, :]
+ # get other keypoints from mesh
+ for myId, meshId in self.kpId2vertices.items():
+ keypoints[:, myId, :] = mesh_vertices[:, meshId, :]
+
+ return keypoints
+
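+# Note on the 21-keypoint layout assembled above: 0 is the wrist, 1-4 thumb,
+# 5-8 index, 9-12 middle, 13-16 ring and 17-20 pinky; 16 joints come from the
+# MANO regressor (KpId2manokps) and the 5 fingertips are taken directly from
+# mesh vertices (kpId2vertices).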
+
+class MANOLayer(_MANOLayer):
+ """Extension of the official MANO implementation."""
+ full_pose_keys = {'global_orient', 'hand_pose'}
+
+ NUM_VERTS = 776
+ NUM_FACES = 9976
+
+ KpId2manokps = {
+ 0: 0, # Wrist
+ 1: 5,
+ 2: 6,
+ 3: 7, # Index
+ 4: 9,
+ 5: 10,
+ 6: 11, # Middle
+ 7: 17,
+ 8: 18,
+ 9: 19, # Pinky
+ 10: 13,
+ 11: 14,
+ 12: 15, # Ring
+ 13: 1,
+ 14: 2,
+ 15: 3
+ } # Thumb
+ kpId2vertices = {
+ 4: 744, # Thumb
+ 8: 320, # Index
+ 12: 443, # Middle
+ 16: 555, # Ring
+        20: 672  # Pinky
+ }
+
+ def __init__(self,
+ *args,
+ keypoint_src: str = 'mano',
+ keypoint_dst: str = 'human_data',
+ keypoint_approximate: bool = False,
+ **kwargs):
+ """
+ Args:
+ *args: extra arguments for MANO initialization.
+ keypoint_src: source convention of keypoints. This convention
+ is used for keypoints obtained from joint regressors.
+ Keypoints then undergo conversion into keypoint_dst
+ convention.
+ keypoint_dst: destination convention of keypoints. This convention
+ is used for keypoints in the output.
+ keypoint_approximate: whether to use approximate matching in
+ convention conversion for keypoints.
+ **kwargs: extra keyword arguments for MANO initialization.
+
+ Returns:
+ None
+ """
+ super(MANOLayer, self).__init__(*args, **kwargs)
+ self.keypoint_src = keypoint_src
+ self.keypoint_dst = keypoint_dst
+ self.keypoint_approximate = keypoint_approximate
+
+ self.num_verts = self.get_num_verts()
+ self.num_faces = self.get_num_faces()
+ self.num_joints = get_keypoint_num(convention=self.keypoint_dst)
+
+ def forward(self,
+ *args,
+ return_verts: bool = True,
+ return_full_pose: bool = False,
+ **kwargs) -> dict:
+ """Forward function.
+
+ Args:
+ *args: extra arguments for MANO
+ return_verts: whether to return vertices
+ return_full_pose: whether to return full pose parameters
+ **kwargs: extra arguments for MANO
+
+ Returns:
+ output: contains output parameters and attributes
+ """
+ if 'right_hand_pose' in kwargs:
+ kwargs['hand_pose'] = kwargs['right_hand_pose']
+ mano_output = super(MANOLayer, self).forward(*args, **kwargs)
+ joints = mano_output.joints
+
+ joints = self.get_keypoints_from_mesh(mano_output.vertices, joints)
+
+ joints, joint_mask = convert_kps(joints,
+ src=self.keypoint_src,
+ dst=self.keypoint_dst,
+ approximate=self.keypoint_approximate)
+ if isinstance(joint_mask, np.ndarray):
+ joint_mask = torch.tensor(joint_mask,
+ dtype=torch.uint8,
+ device=joints.device)
+
+ batch_size = joints.shape[0]
+ joint_mask = joint_mask.reshape(1, -1).expand(batch_size, -1)
+
+ output = dict(
+ global_orient=mano_output.global_orient,
+ hand_pose=mano_output.hand_pose,
+ joints=joints,
+ joint_mask=joint_mask,
+ keypoints=torch.cat([joints, joint_mask[:, :, None]], dim=-1),
+ betas=mano_output.betas,
+ )
+
+ if return_verts:
+ output['vertices'] = mano_output.vertices
+ if return_full_pose:
+ output['full_pose'] = mano_output.full_pose
+
+ return output
+
+ def get_keypoints_from_mesh(self, mesh_vertices, keypoints_regressed):
+        """Assemble the full 21-keypoint set from the 16 MANO keypoints and 5
+        mesh vertices for the fingertips."""
+ batch_size = keypoints_regressed.shape[0]
+        keypoints = keypoints_regressed.new_zeros((batch_size, 21, 3))
+
+ # fill keypoints which are regressed
+ for manoId, myId in self.KpId2manokps.items():
+ keypoints[:, myId, :] = keypoints_regressed[:, manoId, :]
+ # get other keypoints from mesh
+ for myId, meshId in self.kpId2vertices.items():
+ keypoints[:, myId, :] = mesh_vertices[:, meshId, :]
+
+ return keypoints
diff --git a/detrsmpl/models/body_models/smpl.py b/detrsmpl/models/body_models/smpl.py
new file mode 100644
index 0000000000000000000000000000000000000000..215e4e2f9ef0134eb69422e73f01206c8ee5741f
--- /dev/null
+++ b/detrsmpl/models/body_models/smpl.py
@@ -0,0 +1,610 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from typing import Optional
+
+import numpy as np
+import torch
+from smplx import SMPL as _SMPL
+from smplx.lbs import batch_rigid_transform, blend_shapes, vertices2joints
+
+from detrsmpl.core.conventions.keypoints_mapping import (
+ convert_kps,
+ get_keypoint_num,
+)
+from detrsmpl.core.conventions.segmentation import body_segmentation
+from detrsmpl.models.utils import batch_inverse_kinematics_transform
+from detrsmpl.utils.transforms import quat_to_rotmat
+
+
+class SMPL(_SMPL):
+ """Extension of the official SMPL implementation."""
+
+ body_pose_keys = {
+ 'global_orient',
+ 'body_pose',
+ }
+ full_pose_keys = {
+ 'global_orient',
+ 'body_pose',
+ }
+ NUM_VERTS = 6890
+ NUM_FACES = 13776
+
+ def __init__(self,
+ *args,
+ keypoint_src: str = 'smpl_45',
+ keypoint_dst: str = 'human_data',
+ keypoint_approximate: bool = False,
+ joints_regressor: str = None,
+ extra_joints_regressor: str = None,
+ **kwargs) -> None:
+ """
+ Args:
+ *args: extra arguments for SMPL initialization.
+ keypoint_src: source convention of keypoints. This convention
+ is used for keypoints obtained from joint regressors.
+ Keypoints then undergo conversion into keypoint_dst
+ convention.
+ keypoint_dst: destination convention of keypoints. This convention
+ is used for keypoints in the output.
+ keypoint_approximate: whether to use approximate matching in
+ convention conversion for keypoints.
+ joints_regressor: path to joint regressor. Should be a .npy
+ file. If provided, replaces the official J_regressor of SMPL.
+ extra_joints_regressor: path to extra joint regressor. Should be
+ a .npy file. If provided, extra joints are regressed and
+ concatenated after the joints regressed with the official
+ J_regressor or joints_regressor.
+ **kwargs: extra keyword arguments for SMPL initialization.
+
+ Returns:
+ None
+ """
+ super(SMPL, self).__init__(*args, **kwargs)
+ # joints = [JOINT_MAP[i] for i in JOINT_NAMES]
+ self.keypoint_src = keypoint_src
+ self.keypoint_dst = keypoint_dst
+ self.keypoint_approximate = keypoint_approximate
+ # override the default SMPL joint regressor if available
+ if joints_regressor is not None:
+ joints_regressor = torch.tensor(np.load(joints_regressor),
+ dtype=torch.float)
+ self.register_buffer('joints_regressor', joints_regressor)
+
+ # allow for extra joints to be regressed if available
+ if extra_joints_regressor is not None:
+ joints_regressor_extra = torch.tensor(
+ np.load(extra_joints_regressor), dtype=torch.float)
+ self.register_buffer('joints_regressor_extra',
+ joints_regressor_extra)
+
+ self.num_verts = self.get_num_verts()
+ self.num_joints = get_keypoint_num(convention=self.keypoint_dst)
+ self.body_part_segmentation = body_segmentation('smpl')
+
+ def forward(self,
+ *args,
+ return_verts: bool = True,
+ return_full_pose: bool = False,
+ **kwargs) -> dict:
+ """Forward function.
+
+ Args:
+ *args: extra arguments for SMPL
+ return_verts: whether to return vertices
+ return_full_pose: whether to return full pose parameters
+ **kwargs: extra arguments for SMPL
+
+ Returns:
+ output: contains output parameters and attributes
+ """
+
+ kwargs['get_skin'] = True
+ smpl_output = super(SMPL, self).forward(*args, **kwargs)
+
+ if not hasattr(self, 'joints_regressor'):
+ joints = smpl_output.joints
+ else:
+ joints = vertices2joints(self.joints_regressor,
+ smpl_output.vertices)
+
+ if hasattr(self, 'joints_regressor_extra'):
+ extra_joints = vertices2joints(self.joints_regressor_extra,
+ smpl_output.vertices)
+ joints = torch.cat([joints, extra_joints], dim=1)
+
+ joints, joint_mask = convert_kps(joints,
+ src=self.keypoint_src,
+ dst=self.keypoint_dst,
+ approximate=self.keypoint_approximate)
+ if isinstance(joint_mask, np.ndarray):
+ joint_mask = torch.tensor(joint_mask,
+ dtype=torch.uint8,
+ device=joints.device)
+
+ batch_size = joints.shape[0]
+ joint_mask = joint_mask.reshape(1, -1).expand(batch_size, -1)
+
+ output = dict(global_orient=smpl_output.global_orient,
+ body_pose=smpl_output.body_pose,
+ joints=joints,
+ joint_mask=joint_mask,
+ keypoints=torch.cat([joints, joint_mask[:, :, None]],
+ dim=-1),
+ betas=smpl_output.betas)
+
+ if return_verts:
+ output['vertices'] = smpl_output.vertices
+ if return_full_pose:
+ output['full_pose'] = smpl_output.full_pose
+
+ return output
+
+ @classmethod
+ def tensor2dict(cls,
+ full_pose: torch.Tensor,
+ betas: Optional[torch.Tensor] = None,
+ transl: Optional[torch.Tensor] = None):
+ """Convert full pose tensor to pose dict.
+
+ Args:
+            full_pose (torch.Tensor): shape should be (..., 72) or
+                (..., 24, 3). All zeros for T-pose.
+ betas (Optional[torch.Tensor], optional): shape should be
+ (..., 10). The batch num should be 1 or corresponds with
+ full_pose.
+ Defaults to None.
+ transl (Optional[torch.Tensor], optional): shape should be
+ (..., 3). The batch num should be 1 or corresponds with
+ full_pose.
+ Defaults to None.
+ Returns:
+ dict: dict of smpl pose containing transl & betas.
+ """
+ full_pose = full_pose.view(-1, (cls.NUM_BODY_JOINTS + 1) * 3)
+ body_pose = full_pose[:, 3:]
+ global_orient = full_pose[:, :3]
+ batch_size = full_pose.shape[0]
+ if betas is not None:
+ # squeeze or unsqueeze betas to 2 dims
+ betas = betas.view(-1, betas.shape[-1])
+ if betas.shape[0] == 1:
+ betas = betas.repeat(batch_size, 1)
+ else:
+ betas = betas
+ transl = transl.view(batch_size, -1) if transl is not None else transl
+ return {
+ 'betas': betas,
+ 'body_pose': body_pose,
+ 'global_orient': global_orient,
+ 'transl': transl,
+ }
+
+ @classmethod
+ def dict2tensor(cls, smpl_dict: dict) -> torch.Tensor:
+ """Convert smpl pose dict to full pose tensor.
+
+ Args:
+ smpl_dict (dict): smpl pose dict.
+
+ Returns:
+            torch.Tensor: full pose tensor.
+ """
+ assert cls.body_pose_keys.issubset(smpl_dict)
+ for k in smpl_dict:
+ if isinstance(smpl_dict[k], np.ndarray):
+ smpl_dict[k] = torch.Tensor(smpl_dict[k])
+ global_orient = smpl_dict['global_orient'].view(-1, 3)
+ body_pose = smpl_dict['body_pose'].view(-1, 3 * cls.NUM_BODY_JOINTS)
+ full_pose = torch.cat([global_orient, body_pose], dim=1)
+ return full_pose
+
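+
+# Documentation sketch: tensor2dict and dict2tensor are inverse views of the
+# same 72-dim axis-angle pose (3 values for global_orient plus 23 * 3 for
+# body_pose). The shapes below are only an illustration.
+def _example_smpl_pose_roundtrip():
+    import torch
+    full_pose = torch.zeros(2, 72)  # a batch of two T-poses
+    pose_dict = SMPL.tensor2dict(full_pose)
+    restored = SMPL.dict2tensor(pose_dict)
+    assert restored.shape == (2, 72)
+    return pose_dict['body_pose'].shape  # (2, 69)
+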
+
+class GenderedSMPL(torch.nn.Module):
+ """A wrapper of SMPL to handle gendered inputs."""
+ def __init__(self,
+ *args,
+ keypoint_src: str = 'smpl_45',
+ keypoint_dst: str = 'human_data',
+ keypoint_approximate: bool = False,
+ joints_regressor: str = None,
+ extra_joints_regressor: str = None,
+ **kwargs) -> None:
+ """
+ Args:
+ *args: extra arguments for SMPL initialization.
+ keypoint_src: source convention of keypoints. This convention
+ is used for keypoints obtained from joint regressors.
+ Keypoints then undergo conversion into keypoint_dst
+ convention.
+ keypoint_dst: destination convention of keypoints. This convention
+ is used for keypoints in the output.
+ keypoint_approximate: whether to use approximate matching in
+ convention conversion for keypoints.
+ joints_regressor: path to joint regressor. Should be a .npy
+ file. If provided, replaces the official J_regressor of SMPL.
+ extra_joints_regressor: path to extra joint regressor. Should be
+ a .npy file. If provided, extra joints are regressed and
+ concatenated after the joints regressed with the official
+ J_regressor or joints_regressor.
+ **kwargs: extra keyword arguments for SMPL initialization.
+
+ Returns:
+ None
+ """
+ super(GenderedSMPL, self).__init__()
+
+ assert 'gender' not in kwargs, \
+ self.__class__.__name__ + \
+            ' does not need \'gender\' for initialization.'
+
+ self.smpl_neutral = SMPL(*args,
+ gender='neutral',
+ keypoint_src=keypoint_src,
+ keypoint_dst=keypoint_dst,
+ keypoint_approximate=keypoint_approximate,
+ joints_regressor=joints_regressor,
+ extra_joints_regressor=extra_joints_regressor,
+ **kwargs)
+
+ self.smpl_male = SMPL(*args,
+ gender='male',
+ keypoint_src=keypoint_src,
+ keypoint_dst=keypoint_dst,
+ keypoint_approximate=keypoint_approximate,
+ joints_regressor=joints_regressor,
+ extra_joints_regressor=extra_joints_regressor,
+ **kwargs)
+
+ self.smpl_female = SMPL(*args,
+ gender='female',
+ keypoint_src=keypoint_src,
+ keypoint_dst=keypoint_dst,
+ keypoint_approximate=keypoint_approximate,
+ joints_regressor=joints_regressor,
+ extra_joints_regressor=extra_joints_regressor,
+ **kwargs)
+
+ self.num_verts = self.smpl_neutral.num_verts
+ self.num_joints = self.smpl_neutral.num_joints
+ self.faces = self.smpl_neutral.faces
+
+ def forward(self,
+ *args,
+ betas: torch.Tensor = None,
+ body_pose: torch.Tensor = None,
+ global_orient: torch.Tensor = None,
+ transl: torch.Tensor = None,
+ return_verts: bool = True,
+ return_full_pose: bool = False,
+ gender: torch.Tensor = None,
+ device=None,
+ **kwargs):
+ """Forward function.
+ Note:
+ B: batch size
+ J: number of joints of model, J = 23 (SMPL)
+ K: number of keypoints
+ Args:
+ *args: extra arguments
+ betas: Tensor([B, 10]), human body shape parameters of SMPL model.
+            body_pose: Tensor([B, J*3] or [B, J, 3, 3]), human body pose
+                parameters of SMPL model. It should be an axis-angle vector
+                ([B, J*3]) or a rotation matrix ([B, J, 3, 3]).
+            global_orient: Tensor([B, 3] or [B, 1, 3, 3]), global orientation
+                of the human body. It should be an axis-angle vector ([B, 3])
+                or a rotation matrix ([B, 1, 3, 3]).
+ transl: Tensor([B, 3]), global translation of human body.
+            gender: Tensor([B]), gender parameters of the human body. -1 for
+                neutral, 0 for male, 1 for female.
+ device: the device of the output
+ **kwargs: extra keyword arguments
+ Returns:
+ outputs (dict): Dict with mesh vertices and joints.
+ - vertices: Tensor([B, V, 3]), mesh vertices
+ - joints: Tensor([B, K, 3]), 3d keypoints regressed from
+ mesh vertices.
+ """
+
+ batch_size = None
+ for attr in [betas, body_pose, global_orient, transl]:
+ if attr is not None:
+ if device is None:
+ device = attr.device
+ if batch_size is None:
+ batch_size = attr.shape[0]
+ else:
+ assert batch_size == attr.shape[0]
+
+ if gender is not None:
+ output = {
+ 'vertices':
+ torch.zeros([batch_size, self.num_verts, 3], device=device),
+ 'joints':
+ torch.zeros([batch_size, self.num_joints, 3], device=device),
+ 'joint_mask':
+ torch.zeros([batch_size, self.num_joints],
+ dtype=torch.uint8,
+ device=device)
+ }
+
+ for body_model, gender_label in \
+ [(self.smpl_neutral, -1),
+ (self.smpl_male, 0),
+ (self.smpl_female, 1)]:
+ gender_idxs = gender == gender_label
+
+ # skip if no such gender is present
+ if gender_idxs.sum() == 0:
+ continue
+
+ output_model = body_model(
+ betas=betas[gender_idxs] if betas is not None else None,
+ body_pose=body_pose[gender_idxs]
+ if body_pose is not None else None,
+ global_orient=global_orient[gender_idxs]
+ if global_orient is not None else None,
+ transl=transl[gender_idxs] if transl is not None else None,
+ **kwargs)
+
+ output['joints'][gender_idxs] = output_model['joints']
+
+ # TODO: quick fix
+ if 'joint_mask' in output_model:
+ output['joint_mask'][gender_idxs] = output_model[
+ 'joint_mask']
+
+ if return_verts:
+ output['vertices'][gender_idxs] = output_model['vertices']
+ if return_full_pose:
+ output['full_pose'][gender_idxs] = output_model[
+ 'full_pose']
+ else:
+ output = self.smpl_neutral(betas=betas,
+ body_pose=body_pose,
+ global_orient=global_orient,
+ transl=transl,
+ **kwargs)
+
+ return output
+
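+
+# Documentation sketch: per-sample gender selection. The gender tensor follows
+# the encoding documented above (-1 neutral, 0 male, 1 female); the function
+# arguments here are placeholders and their batch size must match the length
+# of the gender tensor.
+def _example_gendered_forward(gendered_smpl, betas, body_pose, global_orient):
+    import torch
+    gender = torch.tensor([0, 1, -1])  # male, female, neutral
+    return gendered_smpl(betas=betas,
+                         body_pose=body_pose,
+                         global_orient=global_orient,
+                         gender=gender)
+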
+
+def to_tensor(array, dtype=torch.float32):
+    if not isinstance(array, torch.Tensor):
+        return torch.tensor(array, dtype=dtype)
+    return array
+
+
+def to_np(array, dtype=np.float32):
+ if 'scipy.sparse' in str(type(array)):
+ array = array.todense()
+ return np.array(array, dtype=dtype)
+
+
+class HybrIKSMPL(SMPL):
+ """Extension of the SMPL for HybrIK."""
+
+ NUM_JOINTS = 23
+ NUM_BODY_JOINTS = 23
+ NUM_BETAS = 10
+ JOINT_NAMES = [
+ 'pelvis',
+ 'left_hip',
+ 'right_hip', # 2
+ 'spine1',
+ 'left_knee',
+ 'right_knee', # 5
+ 'spine2',
+ 'left_ankle',
+ 'right_ankle', # 8
+ 'spine3',
+ 'left_foot',
+ 'right_foot', # 11
+ 'neck',
+ 'left_collar',
+ 'right_collar', # 14
+ 'jaw', # 15
+ 'left_shoulder',
+ 'right_shoulder', # 17
+ 'left_elbow',
+ 'right_elbow', # 19
+ 'left_wrist',
+ 'right_wrist', # 21
+ 'left_thumb',
+ 'right_thumb', # 23
+ 'head',
+ 'left_middle',
+ 'right_middle', # 26
+ 'left_bigtoe',
+ 'right_bigtoe' # 28
+ ]
+ LEAF_NAMES = [
+ 'head', 'left_middle', 'right_middle', 'left_bigtoe', 'right_bigtoe'
+ ]
+ root_idx_17 = 0
+ root_idx_smpl = 0
+
+ def __init__(self, *args, extra_joints_regressor=None, **kwargs):
+ """
+ Args:
+ *args: extra arguments for SMPL initialization.
+ extra_joints_regressor: path to extra joint regressor. Should be
+ a .npy file. If provided, extra joints are regressed and
+ concatenated after the joints regressed with the official
+ J_regressor or joints_regressor.
+ **kwargs: extra keyword arguments for SMPL initialization.
+
+ Returns:
+ None
+ """
+ super(HybrIKSMPL,
+ self).__init__(*args,
+ extra_joints_regressor=extra_joints_regressor,
+ create_betas=False,
+ create_global_orient=False,
+ create_body_pose=False,
+ create_transl=False,
+ **kwargs)
+
+ self.dtype = torch.float32
+ self.num_joints = 29
+
+ self.ROOT_IDX = self.JOINT_NAMES.index('pelvis')
+ self.LEAF_IDX = [
+ self.JOINT_NAMES.index(name) for name in self.LEAF_NAMES
+ ]
+ self.SPINE3_IDX = 9
+        # indices of the parent for each joint
+ parents = torch.zeros(len(self.JOINT_NAMES), dtype=torch.long)
+ # extend kinematic tree
+ parents[:24] = self.parents
+ parents[24] = 15
+ parents[25] = 22
+ parents[26] = 23
+ parents[27] = 10
+ parents[28] = 11
+ if parents.shape[0] > self.num_joints:
+ parents = parents[:24]
+ self.register_buffer('children_map',
+ self._parents_to_children(parents))
+ self.parents = parents
+
+ def _parents_to_children(self, parents):
+ children = torch.ones_like(parents) * -1
+ for i in range(self.num_joints):
+ if children[parents[i]] < 0:
+ children[parents[i]] = i
+ for i in self.LEAF_IDX:
+ if i < children.shape[0]:
+ children[i] = -1
+
+ children[self.SPINE3_IDX] = -3
+ children[0] = 3
+ children[self.SPINE3_IDX] = self.JOINT_NAMES.index('neck')
+
+ return children
+
+ def forward(self,
+ pose_skeleton,
+ betas,
+ phis,
+ global_orient,
+ transl=None,
+ return_verts=True,
+ leaf_thetas=None):
+ """Inverse pass for the SMPL model.
+
+ Args:
+ pose_skeleton: torch.tensor, optional, shape Bx(J*3)
+ It should be a tensor that contains joint locations in
+ (img, Y, Z) format. (default=None)
+ betas: torch.tensor, optional, shape Bx10
+                It can be used if the shape parameters `betas` are predicted
+                from some external model.
+ (default=None)
+ phis: torch.tensor, shape Bx23x2
+ Rotation on bone axis parameters
+ global_orient: torch.tensor, optional, shape Bx3
+ Global Orientations.
+ transl: torch.tensor, optional, shape Bx3
+ Global Translations.
+ return_verts: bool, optional
+ Return the vertices. (default=True)
+ leaf_thetas: torch.tensor, optional, shape Bx5x4
+ Quaternions of 5 leaf joints. (default=None)
+
+        Returns:
+            outputs: output dictionary.
+ """
+ batch_size = pose_skeleton.shape[0]
+
+ if leaf_thetas is not None:
+ leaf_thetas = leaf_thetas.reshape(batch_size * 5, 4)
+ leaf_thetas = quat_to_rotmat(leaf_thetas)
+
+ batch_size = max(betas.shape[0], pose_skeleton.shape[0])
+ device = betas.device
+
+ # 1. Add shape contribution
+ v_shaped = self.v_template + blend_shapes(betas, self.shapedirs)
+
+ # 2. Get the rest joints
+ # NxJx3 array
+ if leaf_thetas is not None:
+ rest_J = vertices2joints(self.J_regressor, v_shaped)
+ else:
+ rest_J = torch.zeros((v_shaped.shape[0], 29, 3),
+ dtype=self.dtype,
+ device=device)
+ rest_J[:, :24] = vertices2joints(self.J_regressor, v_shaped)
+
+ leaf_number = [411, 2445, 5905, 3216, 6617]
+ leaf_vertices = v_shaped[:, leaf_number].clone()
+ rest_J[:, 24:] = leaf_vertices
+
+        # 3. Get the rotation matrices
+ rot_mats, rotate_rest_pose = batch_inverse_kinematics_transform(
+ pose_skeleton,
+ global_orient,
+ phis,
+ rest_J.clone(),
+ self.children_map,
+ self.parents,
+ dtype=self.dtype,
+ train=self.training,
+ leaf_thetas=leaf_thetas)
+
+ test_joints = True
+ if test_joints:
+ new_joints, A = batch_rigid_transform(rot_mats,
+ rest_J[:, :24].clone(),
+ self.parents[:24],
+ dtype=self.dtype)
+ else:
+ new_joints = None
+
+ # assert torch.mean(torch.abs(rotate_rest_pose - new_joints)) < 1e-5
+ # 4. Add pose blend shapes
+ # rot_mats: N x (J + 1) x 3 x 3
+ ident = torch.eye(3, dtype=self.dtype, device=device)
+ pose_feature = (rot_mats[:, 1:] - ident).view([batch_size, -1])
+ pose_offsets = torch.matmul(pose_feature, self.posedirs) \
+ .view(batch_size, -1, 3)
+
+ v_posed = pose_offsets + v_shaped
+
+ # 5. Do skinning:
+ # W is N x V x (J + 1)
+ W = self.lbs_weights.unsqueeze(dim=0).expand([batch_size, -1, -1])
+ # (N x V x (J + 1)) x (N x (J + 1) x 16)
+ num_joints = self.J_regressor.shape[0]
+ T = torch.matmul(W, A.view(batch_size, num_joints, 16)) \
+ .view(batch_size, -1, 4, 4)
+
+ homogen_coord = torch.ones([batch_size, v_posed.shape[1], 1],
+ dtype=self.dtype,
+ device=device)
+ v_posed_homo = torch.cat([v_posed, homogen_coord], dim=2)
+ v_homo = torch.matmul(T, torch.unsqueeze(v_posed_homo, dim=-1))
+
+ vertices = v_homo[:, :, :3, 0]
+ joints_from_verts = vertices2joints(self.joints_regressor_extra,
+ vertices)
+
+ # rot_mats = rot_mats.reshape(batch_size * 24, 3, 3)
+ if transl is not None:
+ new_joints += transl.unsqueeze(dim=1)
+ vertices += transl.unsqueeze(dim=1)
+ joints_from_verts += transl.unsqueeze(dim=1)
+ else:
+ new_joints = new_joints - \
+ new_joints[:, self.root_idx_smpl, :].unsqueeze(1).detach()
+ joints_from_verts = joints_from_verts - \
+ joints_from_verts[:, self.root_idx_17, :].unsqueeze(1).detach()
+
+ output = {
+ 'vertices': vertices,
+ 'joints': new_joints,
+ 'poses': rot_mats,
+ 'joints_from_verts': joints_from_verts,
+ }
+ return output
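+
+
+# Documentation sketch of the inverse-kinematics forward above (B = batch
+# size, shapes as documented in the docstring): pose_skeleton (B, J*3),
+# betas (B, 10), phis (B, 23, 2) and optional leaf_thetas (B, 5, 4) are turned
+# into per-joint rotation matrices, which then drive standard linear blend
+# skinning. The returned dict holds 'vertices', 'joints', 'poses' (the
+# rotation matrices) and 'joints_from_verts'.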
diff --git a/detrsmpl/models/body_models/smplx.py b/detrsmpl/models/body_models/smplx.py
new file mode 100644
index 0000000000000000000000000000000000000000..cfe1fb723bf46aeca17cc5f1f0a8dae5ec0df5f2
--- /dev/null
+++ b/detrsmpl/models/body_models/smplx.py
@@ -0,0 +1,375 @@
+from typing import Optional
+
+import numpy as np
+import torch
+from smplx import SMPLX as _SMPLX
+from smplx import SMPLXLayer as _SMPLXLayer
+from smplx.lbs import vertices2joints
+
+from detrsmpl.core.conventions.keypoints_mapping import (
+ convert_kps,
+ get_keypoint_num,
+)
+from detrsmpl.core.conventions.segmentation import body_segmentation
+
+
+class SMPLX(_SMPLX):
+ """Extension of the official SMPL-X implementation."""
+
+ body_pose_keys = {'global_orient', 'body_pose'}
+ full_pose_keys = {
+ 'global_orient', 'body_pose', 'left_hand_pose', 'right_hand_pose',
+ 'jaw_pose', 'leye_pose', 'reye_pose'
+ }
+ NUM_VERTS = 10475
+ NUM_FACES = 20908
+
+ def __init__(self,
+ *args,
+ keypoint_src: str = 'smplx',
+ keypoint_dst: str = 'human_data',
+ keypoint_approximate: bool = False,
+ joints_regressor: str = None,
+ extra_joints_regressor: str = None,
+ **kwargs):
+ """
+ Args:
+ *args: extra arguments for SMPL initialization.
+ keypoint_src: source convention of keypoints. This convention
+ is used for keypoints obtained from joint regressors.
+ Keypoints then undergo conversion into keypoint_dst
+ convention.
+ keypoint_dst: destination convention of keypoints. This convention
+ is used for keypoints in the output.
+ keypoint_approximate: whether to use approximate matching in
+ convention conversion for keypoints.
+ joints_regressor: path to joint regressor. Should be a .npy
+ file. If provided, replaces the official J_regressor of SMPL.
+ extra_joints_regressor: path to extra joint regressor. Should be
+ a .npy file. If provided, extra joints are regressed and
+ concatenated after the joints regressed with the official
+ J_regressor or joints_regressor.
+ **kwargs: extra keyword arguments for SMPL initialization.
+
+ Returns:
+ None
+ """
+ super(SMPLX, self).__init__(*args, **kwargs)
+ # joints = [JOINT_MAP[i] for i in JOINT_NAMES]
+ self.keypoint_src = keypoint_src
+ self.keypoint_dst = keypoint_dst
+ self.keypoint_approximate = keypoint_approximate
+
+ # override the default SMPL joint regressor if available
+ if joints_regressor is not None:
+ joints_regressor = torch.tensor(np.load(joints_regressor),
+ dtype=torch.float)
+ self.register_buffer('joints_regressor', joints_regressor)
+
+ # allow for extra joints to be regressed if available
+ if extra_joints_regressor is not None:
+ joints_regressor_extra = torch.tensor(
+ np.load(extra_joints_regressor), dtype=torch.float)
+ self.register_buffer('joints_regressor_extra',
+ joints_regressor_extra)
+
+ self.num_verts = self.get_num_verts()
+ self.num_joints = get_keypoint_num(convention=self.keypoint_dst)
+ self.body_part_segmentation = body_segmentation('smplx')
+
+ def forward(self,
+ *args,
+ return_verts: bool = True,
+ return_full_pose: bool = False,
+ **kwargs) -> dict:
+ """Forward function.
+
+ Args:
+ *args: extra arguments for SMPL
+ return_verts: whether to return vertices
+ return_full_pose: whether to return full pose parameters
+ **kwargs: extra arguments for SMPL
+
+ Returns:
+ output: contains output parameters and attributes
+ """
+
+ kwargs['get_skin'] = True
+ smplx_output = super(SMPLX, self).forward(*args, **kwargs)
+
+ if not hasattr(self, 'joints_regressor'):
+ joints = smplx_output.joints
+ else:
+ joints = vertices2joints(self.joints_regressor,
+ smplx_output.vertices)
+
+ if hasattr(self, 'joints_regressor_extra'):
+ extra_joints = vertices2joints(self.joints_regressor_extra,
+ smplx_output.vertices)
+ joints = torch.cat([joints, extra_joints], dim=1)
+
+ joints, joint_mask = convert_kps(joints,
+ src=self.keypoint_src,
+ dst=self.keypoint_dst,
+ approximate=self.keypoint_approximate)
+ if isinstance(joint_mask, np.ndarray):
+ joint_mask = torch.tensor(joint_mask,
+ dtype=torch.uint8,
+ device=joints.device)
+
+ batch_size = joints.shape[0]
+ joint_mask = joint_mask.reshape(1, -1).expand(batch_size, -1)
+
+ output = dict(global_orient=smplx_output.global_orient,
+ body_pose=smplx_output.body_pose,
+ joints=joints,
+ joint_mask=joint_mask,
+ keypoints=torch.cat([joints, joint_mask[:, :, None]],
+ dim=-1),
+ betas=smplx_output.betas)
+
+ if return_verts:
+ output['vertices'] = smplx_output.vertices
+ if return_full_pose:
+ output['full_pose'] = smplx_output.full_pose
+
+ return output
+
+ @classmethod
+ def tensor2dict(cls,
+ full_pose: torch.Tensor,
+ betas: Optional[torch.Tensor] = None,
+ transl: Optional[torch.Tensor] = None,
+ expression: Optional[torch.Tensor] = None) -> dict:
+ """Convert full pose tensor to pose dict.
+
+ Args:
+ full_pose (torch.Tensor): shape should be (..., 165) or
+ (..., 55, 3). All zeros for T-pose.
+ betas (Optional[torch.Tensor], optional): shape should be
+ (..., 10). The batch num should be 1 or corresponds with
+ full_pose.
+ Defaults to None.
+ transl (Optional[torch.Tensor], optional): shape should be
+ (..., 3). The batch num should be 1 or corresponds with
+ full_pose.
+ Defaults to None.
+ expression (Optional[torch.Tensor], optional): shape should
+ be (..., 10). The batch num should be 1 or corresponds with
+ full_pose.
+ Defaults to None.
+
+ Returns:
+ dict: dict of smplx pose containing transl & betas.
+ """
+ NUM_BODY_JOINTS = cls.NUM_BODY_JOINTS
+ NUM_HAND_JOINTS = cls.NUM_HAND_JOINTS
+ NUM_FACE_JOINTS = cls.NUM_FACE_JOINTS
+ NUM_JOINTS = NUM_BODY_JOINTS + 2 * NUM_HAND_JOINTS + NUM_FACE_JOINTS
+ full_pose = full_pose.view(-1, (NUM_JOINTS + 1), 3)
+ global_orient = full_pose[:, :1]
+ body_pose = full_pose[:, 1:NUM_BODY_JOINTS + 1]
+ jaw_pose = full_pose[:, NUM_BODY_JOINTS + 1:NUM_BODY_JOINTS + 2]
+ leye_pose = full_pose[:, NUM_BODY_JOINTS + 2:NUM_BODY_JOINTS + 3]
+ reye_pose = full_pose[:, NUM_BODY_JOINTS + 3:NUM_BODY_JOINTS + 4]
+ left_hand_pose = full_pose[:, NUM_BODY_JOINTS + 4:NUM_BODY_JOINTS + 19]
+ right_hand_pose = full_pose[:,
+ NUM_BODY_JOINTS + 19:NUM_BODY_JOINTS + 34]
+ batch_size = body_pose.shape[0]
+ if betas is not None:
+ # squeeze or unsqueeze betas to 2 dims
+ betas = betas.view(-1, betas.shape[-1])
+ if betas.shape[0] == 1:
+ betas = betas.repeat(batch_size, 1)
+ else:
+ betas = betas
+ transl = transl.view(batch_size, -1) if transl is not None else transl
+ expression = expression.view(
+ batch_size, -1) if expression is not None else torch.zeros(
+ batch_size, 10).to(body_pose.device)
+ return {
+ 'betas':
+ betas,
+ 'global_orient':
+ global_orient.view(batch_size, 3),
+ 'body_pose':
+ body_pose.view(batch_size, NUM_BODY_JOINTS * 3),
+ 'left_hand_pose':
+ left_hand_pose.view(batch_size, NUM_HAND_JOINTS * 3),
+ 'right_hand_pose':
+ right_hand_pose.view(batch_size, NUM_HAND_JOINTS * 3),
+ 'transl':
+ transl,
+ 'expression':
+ expression,
+ 'jaw_pose':
+ jaw_pose.view(batch_size, 3),
+ 'leye_pose':
+ leye_pose.view(batch_size, 3),
+ 'reye_pose':
+ reye_pose.view(batch_size, 3),
+ }
+
+ @classmethod
+ def dict2tensor(cls, smplx_dict: dict) -> torch.Tensor:
+ """Convert smplx pose dict to full pose tensor.
+
+ Args:
+ smplx_dict (dict): smplx pose dict.
+
+ Returns:
+            torch.Tensor: full pose tensor.
+ """
+ assert cls.body_pose_keys.issubset(smplx_dict)
+ for k in smplx_dict:
+ if isinstance(smplx_dict[k], np.ndarray):
+ smplx_dict[k] = torch.Tensor(smplx_dict[k])
+ NUM_BODY_JOINTS = cls.NUM_BODY_JOINTS
+ NUM_HAND_JOINTS = cls.NUM_HAND_JOINTS
+ NUM_FACE_JOINTS = cls.NUM_FACE_JOINTS
+ NUM_JOINTS = NUM_BODY_JOINTS + 2 * NUM_HAND_JOINTS + NUM_FACE_JOINTS
+ global_orient = smplx_dict['global_orient'].reshape(-1, 1, 3)
+ body_pose = smplx_dict['body_pose'].reshape(-1, NUM_BODY_JOINTS, 3)
+ batch_size = global_orient.shape[0]
+ jaw_pose = smplx_dict.get('jaw_pose', torch.zeros((batch_size, 1, 3)))
+ leye_pose = smplx_dict.get('leye_pose', torch.zeros(
+ (batch_size, 1, 3)))
+ reye_pose = smplx_dict.get('reye_pose', torch.zeros(
+ (batch_size, 1, 3)))
+ left_hand_pose = smplx_dict.get(
+ 'left_hand_pose', torch.zeros((batch_size, NUM_HAND_JOINTS, 3)))
+ right_hand_pose = smplx_dict.get(
+ 'right_hand_pose', torch.zeros((batch_size, NUM_HAND_JOINTS, 3)))
+ full_pose = torch.cat([
+ global_orient, body_pose,
+ jaw_pose.reshape(-1, 1, 3),
+ leye_pose.reshape(-1, 1, 3),
+ reye_pose.reshape(-1, 1, 3),
+ left_hand_pose.reshape(-1, 15, 3),
+ right_hand_pose.reshape(-1, 15, 3)
+ ],
+ dim=1).reshape(-1, (NUM_JOINTS + 1) * 3)
+ return full_pose
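+
+# Note: the full pose handled by tensor2dict/dict2tensor above is laid out as
+# 1 (global_orient) + 21 (body) + 1 (jaw) + 1 (leye) + 1 (reye)
+# + 15 (left hand) + 15 (right hand) = 55 joints, i.e. 165 axis-angle values,
+# matching the (..., 165) / (..., 55, 3) shapes documented in tensor2dict.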
+
+
+class SMPLXLayer(_SMPLXLayer):
+ """Extension of the official SMPL-X implementation."""
+
+ body_pose_keys = {'global_orient', 'body_pose'}
+ full_pose_keys = {
+ 'global_orient', 'body_pose', 'left_hand_pose', 'right_hand_pose',
+ 'jaw_pose', 'leye_pose', 'reye_pose'
+ }
+ NUM_VERTS = 10475
+ NUM_FACES = 20908
+
+ def __init__(self,
+ *args,
+ keypoint_src: str = 'smplx',
+ keypoint_dst: str = 'human_data',
+ keypoint_approximate: bool = False,
+ joints_regressor: str = None,
+ extra_joints_regressor: str = None,
+ **kwargs):
+ """
+ Args:
+ *args: extra arguments for SMPL initialization.
+ keypoint_src: source convention of keypoints. This convention
+ is used for keypoints obtained from joint regressors.
+ Keypoints then undergo conversion into keypoint_dst
+ convention.
+ keypoint_dst: destination convention of keypoints. This convention
+ is used for keypoints in the output.
+ keypoint_approximate: whether to use approximate matching in
+ convention conversion for keypoints.
+ joints_regressor: path to joint regressor. Should be a .npy
+ file. If provided, replaces the official J_regressor of SMPL.
+ extra_joints_regressor: path to extra joint regressor. Should be
+ a .npy file. If provided, extra joints are regressed and
+ concatenated after the joints regressed with the official
+ J_regressor or joints_regressor.
+ **kwargs: extra keyword arguments for SMPL initialization.
+
+ Returns:
+ None
+ """
+ super(SMPLXLayer, self).__init__(*args, **kwargs)
+ # joints = [JOINT_MAP[i] for i in JOINT_NAMES]
+ self.keypoint_src = keypoint_src
+ self.keypoint_dst = keypoint_dst
+ self.keypoint_approximate = keypoint_approximate
+
+ # override the default SMPL joint regressor if available
+ if joints_regressor is not None:
+ joints_regressor = torch.tensor(np.load(joints_regressor),
+ dtype=torch.float)
+ self.register_buffer('joints_regressor', joints_regressor)
+
+ # allow for extra joints to be regressed if available
+ if extra_joints_regressor is not None:
+ joints_regressor_extra = torch.tensor(
+ np.load(extra_joints_regressor), dtype=torch.float)
+ self.register_buffer('joints_regressor_extra',
+ joints_regressor_extra)
+
+ self.num_verts = self.get_num_verts()
+ self.num_joints = get_keypoint_num(convention=self.keypoint_dst)
+ self.body_part_segmentation = body_segmentation('smplx')
+
+ def forward(self,
+ *args,
+ return_verts: bool = True,
+ return_full_pose: bool = False,
+ **kwargs) -> dict:
+ """Forward function.
+
+ Args:
+ *args: extra arguments for SMPL
+ return_verts: whether to return vertices
+ return_full_pose: whether to return full pose parameters
+ **kwargs: extra arguments for SMPL
+
+ Returns:
+ output: contains output parameters and attributes
+ """
+
+ kwargs['get_skin'] = True
+ smplx_output = super(SMPLXLayer, self).forward(*args, **kwargs)
+
+ if not hasattr(self, 'joints_regressor'):
+ joints = smplx_output.joints
+ else:
+ joints = vertices2joints(self.joints_regressor,
+ smplx_output.vertices)
+
+ if hasattr(self, 'joints_regressor_extra'):
+ extra_joints = vertices2joints(self.joints_regressor_extra,
+ smplx_output.vertices)
+ joints = torch.cat([joints, extra_joints], dim=1)
+
+ joints, joint_mask = convert_kps(joints,
+ src=self.keypoint_src,
+ dst=self.keypoint_dst,
+ approximate=self.keypoint_approximate)
+ if isinstance(joint_mask, np.ndarray):
+ joint_mask = torch.tensor(joint_mask,
+ dtype=torch.uint8,
+ device=joints.device)
+
+ batch_size = joints.shape[0]
+ joint_mask = joint_mask.reshape(1, -1).expand(batch_size, -1)
+
+ output = dict(global_orient=smplx_output.global_orient,
+ body_pose=smplx_output.body_pose,
+ joints=joints,
+ joint_mask=joint_mask,
+ keypoints=torch.cat([joints, joint_mask[:, :, None]],
+ dim=-1),
+ betas=smplx_output.betas)
+
+ if return_verts:
+ output['vertices'] = smplx_output.vertices
+ if return_full_pose:
+ output['full_pose'] = smplx_output.full_pose
+
+ return output
diff --git a/detrsmpl/models/body_models/star.py b/detrsmpl/models/body_models/star.py
new file mode 100644
index 0000000000000000000000000000000000000000..b40531c77091d81c1586eb53d34d38a68bcc0d91
--- /dev/null
+++ b/detrsmpl/models/body_models/star.py
@@ -0,0 +1,333 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import os
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+from detrsmpl.core.conventions.keypoints_mapping import convert_kps
+from detrsmpl.utils.transforms import (
+ aa_to_rotmat,
+ make_homegeneous_rotmat_batch,
+)
+
+
+class STAR(nn.Module):
+
+ NUM_BODY_JOINTS = 24
+
+ def __init__(self,
+ model_path: str,
+ gender: str = 'neutral',
+ keypoint_src: str = 'star',
+ keypoint_dst: str = 'human_data',
+ keypoint_approximate: bool = False,
+ create_global_orient: bool = True,
+ global_orient: Optional[torch.Tensor] = None,
+ create_body_pose: bool = True,
+ body_pose: torch.Tensor = None,
+ num_betas: int = 10,
+ create_betas: bool = True,
+ betas: torch.Tensor = None,
+ create_transl: bool = True,
+ transl: torch.Tensor = None,
+ batch_size: int = 1,
+ dtype: torch.dtype = torch.float32) -> None:
+ """STAR model constructor.
+
+ Args:
+ model_path: str
+ The path to the folder or to the file where the model
+ parameters are stored.
+ gender: str, optional
+ Which gender to load.
+ keypoint_src: str
+ Source convention of keypoints. This convention is used for
+ keypoints obtained from joint regressors. Keypoints then
+ undergo conversion into keypoint_dst convention.
+ keypoint_dst: destination convention of keypoints. This convention
+ is used for keypoints in the output.
+ keypoint_approximate: whether to use approximate matching in
+ convention conversion for keypoints.
+ create_global_orient: bool, optional
+ Flag for creating a member variable for the global orientation
+ of the body. (default = True)
+ global_orient: torch.tensor, optional, Bx3
+ The default value for the global orientation variable.
+ (default = None)
+ create_body_pose: bool, optional
+ Flag for creating a member variable for the pose of the body.
+ (default = True)
+ body_pose: torch.tensor, optional, Bx(3*24)
+ The default value for the body pose variable.
+ (default = None)
+ num_betas: int, optional
+ Number of shape components to use
+ (default = 10).
+ create_betas: bool, optional
+ Flag for creating a member variable for the shape space
+ (default = True).
+ betas: torch.tensor, optional, Bx10
+ The default value for the shape member variable.
+ (default = None)
+ create_transl: bool, optional
+ Flag for creating a member variable for the translation
+ of the body. (default = True)
+ transl: torch.tensor, optional, Bx3
+ The default value for the transl variable.
+ (default = None)
+ batch_size: int, optional
+ The batch size used for creating the member variables.
+ dtype: torch.dtype, optional
+ The data type for the created variables.
+ """
+ if gender not in ['male', 'female', 'neutral']:
+ raise RuntimeError('Invalid gender! Should be one of '
+ '[\'male\', \'female\', or \'neutral\']!')
+ self.gender = gender
+
+ model_fname = 'STAR_{}.npz'.format(gender.upper())
+ if not os.path.exists(model_path):
+ raise RuntimeError('Path {} does not exist!'.format(model_path))
+ elif os.path.isdir(model_path):
+ star_path = os.path.join(model_path, model_fname)
+ else:
+ if os.path.split(model_path)[-1] != model_fname:
+ raise RuntimeError(
+ f'Model filename ({model_fname}) and gender '
+ f'({gender}) are incompatible!')
+ star_path = model_path
+
+ super(STAR, self).__init__()
+
+ self.keypoint_src = keypoint_src
+ self.keypoint_dst = keypoint_dst
+ self.keypoint_approximate = keypoint_approximate
+
+ star_model = np.load(star_path, allow_pickle=True)
+ J_regressor = star_model['J_regressor']
+ self.num_betas = num_betas
+
+        # Model sparse joint regressor; regresses joint locations from a mesh
+ self.register_buffer('J_regressor',
+ torch.tensor(J_regressor, dtype=torch.float))
+
+ # Model skinning weights
+ self.register_buffer(
+ 'weights', torch.tensor(star_model['weights'], dtype=torch.float))
+
+ # Model pose corrective blend shapes
+ self.register_buffer(
+ 'posedirs',
+ torch.tensor(star_model['posedirs'].reshape((-1, 93)),
+ dtype=torch.float))
+
+ # Mean Shape
+ self.register_buffer(
+ 'v_template',
+ torch.tensor(star_model['v_template'], dtype=torch.float))
+
+ # Shape corrective blend shapes
+ self.register_buffer(
+ 'shapedirs',
+ torch.tensor(star_model['shapedirs'][:, :, :num_betas],
+ dtype=torch.float))
+
+        # Mesh triangles
+ self.register_buffer(
+ 'faces', torch.from_numpy(star_model['f'].astype(np.int64)))
+ self.f = star_model['f']
+
+ # Kinematic tree of the model
+ self.register_buffer(
+ 'kintree_table',
+ torch.from_numpy(star_model['kintree_table'].astype(np.int64)))
+
+ id_to_col = {
+ self.kintree_table[1, i].item(): i
+ for i in range(self.kintree_table.shape[1])
+ }
+ self.register_buffer(
+ 'parent',
+ torch.tensor([
+ id_to_col[self.kintree_table[0, it].item()]
+ for it in range(1, self.kintree_table.shape[1])
+ ],
+ dtype=torch.int64))
+
+ if create_global_orient:
+ if global_orient is None:
+ default_global_orient = torch.zeros([batch_size, 3],
+ dtype=dtype)
+ else:
+ if torch.is_tensor(global_orient):
+ default_global_orient = global_orient.clone().detach()
+ else:
+ default_global_orient = torch.tensor(global_orient,
+ dtype=dtype)
+
+ global_orient = nn.Parameter(default_global_orient,
+ requires_grad=True)
+ self.register_parameter('global_orient', global_orient)
+
+ if create_body_pose:
+ if body_pose is None:
+ default_body_pose = torch.zeros(
+ [batch_size, self.NUM_BODY_JOINTS * 3], dtype=dtype)
+ else:
+ if torch.is_tensor(body_pose):
+ default_body_pose = body_pose.clone().detach()
+ else:
+ default_body_pose = torch.tensor(body_pose, dtype=dtype)
+ self.register_parameter(
+ 'body_pose', nn.Parameter(default_body_pose,
+ requires_grad=True))
+
+ if create_betas:
+ if betas is None:
+ default_betas = torch.zeros([batch_size, self.num_betas],
+ dtype=dtype)
+ else:
+ if torch.is_tensor(betas):
+ default_betas = betas.clone().detach()
+ else:
+ default_betas = torch.tensor(betas, dtype=dtype)
+
+ self.register_parameter(
+ 'betas', nn.Parameter(default_betas, requires_grad=True))
+
+ if create_transl:
+ if transl is None:
+ default_transl = torch.zeros([batch_size, 3],
+ dtype=dtype,
+ requires_grad=True)
+ else:
+ default_transl = torch.tensor(transl, dtype=dtype)
+ self.register_parameter(
+ 'transl', nn.Parameter(default_transl, requires_grad=True))
+
+ self.verts = None
+ self.J = None
+ self.R = None
+
+ def forward(self,
+ global_orient: Optional[torch.Tensor] = None,
+ body_pose: Optional[torch.Tensor] = None,
+ betas: Optional[torch.Tensor] = None,
+ transl: Optional[torch.Tensor] = None,
+ return_verts: bool = True,
+ return_full_pose: bool = True) -> torch.Tensor:
+ """Forward pass for the STAR model.
+
+ Args:
+ global_orient: torch.tensor, optional, shape Bx3
+ Global orientation (rotation) of the body. If given, ignore the
+ member variable and use it as the global rotation of the body.
+ Useful if someone wishes to predict this with an external
+ model. (default=None)
+ body_pose: torch.Tensor, shape Bx(J*3)
+ Pose parameters for the STAR model. It should be a tensor that
+ contains joint rotations in axis-angle format. If given, ignore
+ the member variable and use it as the body parameters.
+ (default=None)
+ betas: torch.Tensor, shape Bx10
+ Shape parameters for the STAR model. If given, ignore the
+ member variable and use it as shape parameters. (default=None)
+ transl: torch.Tensor, shape Bx3
+ Translation vector for the STAR model. If given, ignore the
+ member variable and use it as the translation of the body.
+ (default=None)
+ Returns:
+ output: Contains output parameters and attributes corresponding
+ to other body models.
+ """
+ global_orient = (global_orient
+ if global_orient is not None else self.global_orient)
+ body_pose = body_pose if body_pose is not None else self.body_pose
+ betas = betas if betas is not None else self.betas
+ apply_transl = transl is not None or hasattr(self, 'transl')
+ if transl is None and hasattr(self, 'transl'):
+ transl = self.transl
+
+ batch_size = body_pose.shape[0]
+ v_template = self.v_template[None, :]
+ shapedirs = self.shapedirs.view(-1, self.num_betas)[None, :].expand(
+ batch_size, -1, -1)
+ beta = betas[:, :, None]
+ v_shaped = torch.matmul(shapedirs, beta).view(-1, 6890, 3) + v_template
+ J = torch.einsum('bik,ji->bjk', [v_shaped, self.J_regressor])
+
+ pose_quat = self.normalize_quaternion(body_pose.view(-1, 3)).view(
+ batch_size, -1)
+ pose_feat = torch.cat((pose_quat[:, 4:], beta[:, 1]), 1)
+
+ R = aa_to_rotmat(body_pose.view(-1, 3)).view(batch_size, 24, 3, 3)
+ R = R.view(batch_size, 24, 3, 3)
+
+ posedirs = self.posedirs[None, :].expand(batch_size, -1, -1)
+ v_posed = v_shaped + torch.matmul(
+ posedirs, pose_feat[:, :, None]).view(-1, 6890, 3)
+
+ root_transform = make_homegeneous_rotmat_batch(
+ torch.cat((R[:, 0], J[:, 0][:, :, None]), 2))
+ results = [root_transform]
+ for i in range(0, self.parent.shape[0]):
+ transform_i = make_homegeneous_rotmat_batch(
+ torch.cat((R[:, i + 1], J[:, i + 1][:, :, None] -
+ J[:, self.parent[i]][:, :, None]), 2))
+ curr_res = torch.matmul(results[self.parent[i]], transform_i)
+ results.append(curr_res)
+ results = torch.stack(results, dim=1)
+ posed_joints = results[:, :, :3, 3]
+
+ if apply_transl:
+ posed_joints += transl[:, None, :]
+ v_posed += transl[:, None, :]
+
+ joints, joint_mask = convert_kps(posed_joints,
+ src=self.keypoint_src,
+ dst=self.keypoint_dst,
+ approximate=self.keypoint_approximate)
+
+ joint_mask = torch.tensor(joint_mask,
+ dtype=torch.uint8,
+ device=joints.device)
+ joint_mask = joint_mask.reshape(1, -1).expand(batch_size, -1)
+
+ output = dict(global_orient=global_orient,
+ body_pose=body_pose,
+ joints=posed_joints,
+ joint_mask=joint_mask,
+ keypoints=torch.cat([joints, joint_mask[:, :, None]],
+ dim=-1),
+ betas=beta)
+
+ if return_verts:
+ output['vertices'] = v_posed
+ if return_full_pose:
+ output['full_pose'] = torch.cat([global_orient, body_pose], dim=1)
+
+ return output
+
+ @classmethod
+ def normalize_quaternion(cls, theta: torch.Tensor) -> torch.Tensor:
+ """Computes a normalized quaternion ([0,0,0,0] when the body is in rest
+ pose) given joint angles.
+
+ Args:
+ theta (torch.Tensor): A tensor of joint axis-angles with shape
+ (batch_size * num_joints, 3)
+
+ Returns:
+ quat (torch.Tensor)
+ """
+ l1norm = torch.norm(theta + 1e-8, p=2, dim=1)
+ angle = torch.unsqueeze(l1norm, -1)
+ normalized = torch.div(theta, angle)
+ angle = angle * 0.5
+ v_cos = torch.cos(angle)
+ v_sin = torch.sin(angle)
+ quat = torch.cat([v_sin * normalized, v_cos - 1], dim=1)
+ return quat
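+
+
+ # --- Illustrative sketch (added for clarity; not part of the upstream STAR
+ # implementation). It mirrors the axis-angle -> quaternion conversion done
+ # by `normalize_quaternion` above with plain tensor ops and checks that a
+ # rest-pose joint (zero rotation) maps to an (almost) all-zero quaternion
+ # offset, as stated in the docstring. Only the module's existing `torch`
+ # import is assumed.
+ if __name__ == '__main__':
+     aa = torch.zeros(24, 3)  # rest pose: 24 joints, zero rotation
+     aa[1] = torch.tensor([0.0, 0.5, 0.0])  # rotate one joint for contrast
+     angle = torch.norm(aa + 1e-8, p=2, dim=1, keepdim=True)
+     axis = aa / angle
+     quat = torch.cat([torch.sin(angle * 0.5) * axis,
+                       torch.cos(angle * 0.5) - 1], dim=1)
+     # Joint 0 is untouched, so its quaternion offset stays ~[0, 0, 0, 0].
+     assert torch.allclose(quat[0], torch.zeros(4), atol=1e-4)
+     assert not torch.allclose(quat[1], torch.zeros(4), atol=1e-4)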
diff --git a/detrsmpl/models/body_models/utils.py b/detrsmpl/models/body_models/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..da3a4f8dce68a0f1284012453b0136b425ae96d0
--- /dev/null
+++ b/detrsmpl/models/body_models/utils.py
@@ -0,0 +1,116 @@
+import numpy as np
+
+from detrsmpl.utils.transforms import aa_to_rotmat, rotmat_to_aa
+
+
+def transform_to_camera_frame(global_orient, transl, pelvis, extrinsic):
+ """Transform body model parameters to camera frame.
+
+ Args:
+ global_orient (numpy.ndarray): shape (3, ). Only global_orient and
+ transl need to be updated in the rigid transformation.
+ transl (numpy.ndarray): shape (3, ).
+ pelvis (numpy.ndarray): shape (3, ). 3D joint location of the pelvis.
+ This is necessary to eliminate the offset from the SMPL
+ canonical space origin to the pelvis, because the global orient
+ is applied around the pelvis, not the canonical space origin.
+ extrinsic (numpy.ndarray): shape (4, 4). Transformation matrix
+ from the world frame to the camera frame.
+ Returns:
+ (new_global_orient, new_transl)
+ new_global_orient: transformed global orient
+ new_transl: transformed transl
+ """
+
+ # take out the small offset from smpl origin to pelvis
+ transl_offset = pelvis - transl
+ T_p2w = np.eye(4)
+ T_p2w[:3, 3] = transl_offset
+
+ # camera extrinsic: transformation from world frame to camera frame
+ T_w2c = extrinsic
+
+ # smpl transformation: from vertex frame to world frame
+ T_v2p = np.eye(4)
+ global_orient_mat = aa_to_rotmat(global_orient)
+ T_v2p[:3, :3] = global_orient_mat
+ T_v2p[:3, 3] = transl
+
+ # compute combined transformation from vertex to world
+ T_v2w = T_p2w @ T_v2p
+
+ # compute transformation from vertex to camera
+ T_v2c = T_w2c @ T_v2w
+
+ # decompose vertex to camera transformation
+ # np: new pelvis frame
+ # T_v2c = T_np2c x T_v2np
+ T_np2c = T_p2w
+ T_v2np = np.linalg.inv(T_np2c) @ T_v2c
+
+ # decompose into new global orient and new transl
+ new_global_orient_mat = T_v2np[:3, :3]
+ new_global_orient = rotmat_to_aa(new_global_orient_mat)
+ new_transl = T_v2np[:3, 3]
+
+ return new_global_orient, new_transl
+
+
+def batch_transform_to_camera_frame(global_orient, transl, pelvis, extrinsic):
+ """Transform body model parameters to camera frame by batch.
+
+ Args:
+ global_orient (np.ndarray): shape (N, 3). Only global_orient and
+ transl need to be updated in the rigid transformation.
+ transl (np.ndarray): shape (N, 3).
+ pelvis (np.ndarray): shape (N, 3). 3D joint location of the pelvis.
+ This is necessary to eliminate the offset from the SMPL
+ canonical space origin to the pelvis, because the global orient
+ is applied around the pelvis, not the canonical space origin.
+ extrinsic (np.ndarray): shape (4, 4). Transformation matrix
+ from the world frame to the camera frame.
+ Returns:
+ (new_global_orient, new_transl)
+ new_global_orient: transformed global orient
+ new_transl: transformed transl
+ """
+ N = len(global_orient)
+ assert global_orient.shape == (N, 3)
+ assert transl.shape == (N, 3)
+ assert pelvis.shape == (N, 3)
+
+ # take out the small offset from smpl origin to pelvis
+ transl_offset = pelvis - transl
+ T_p2w = np.eye(4).reshape(1, 4, 4).repeat(N, axis=0)
+ T_p2w[:, :3, 3] = transl_offset
+
+ # camera extrinsic: transformation from world frame to camera frame
+ T_w2c = extrinsic
+
+ # smpl transformation: from vertex frame to world frame
+ T_v2p = np.eye(4).reshape(1, 4, 4).repeat(N, axis=0)
+ global_orient_mat = aa_to_rotmat(global_orient)
+ T_v2p[:, :3, :3] = global_orient_mat
+ T_v2p[:, :3, 3] = transl
+
+ # compute combined transformation from vertex to world
+ T_v2w = T_p2w @ T_v2p
+
+ # compute transformation from vertex to camera
+ T_v2c = T_w2c @ T_v2w
+
+ # decompose vertex to camera transformation
+ # np: new pelvis frame
+ # T_v2c = T_np2c x T_v2np
+ T_np2c = T_p2w
+ T_v2np = np.linalg.inv(T_np2c) @ T_v2c
+
+ # decompose into new global orient and new transl
+ new_global_orient_mat = T_v2np[:, :3, :3]
+ new_global_orient = rotmat_to_aa(new_global_orient_mat)
+ new_transl = T_v2np[:, :3, 3]
+
+ assert new_global_orient.shape == (N, 3)
+ assert new_transl.shape == (N, 3)
+
+ return new_global_orient, new_transl
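+
+
+ # --- Illustrative sketch (added for clarity; not part of the original
+ # module). It runs the same random parameters through the single-sample and
+ # batched transforms above and checks that they agree; with an identity
+ # extrinsic the parameters should come back unchanged up to the axis-angle
+ # round trip. Only the module's existing `numpy` import is assumed.
+ if __name__ == '__main__':
+     rng = np.random.default_rng(0)
+     N = 4
+     global_orient = 0.5 * rng.normal(size=(N, 3))  # keep angles < pi
+     transl = rng.normal(size=(N, 3))
+     pelvis = transl + 0.05 * rng.normal(size=(N, 3))
+     extrinsic = np.eye(4)
+
+     batched = batch_transform_to_camera_frame(global_orient, transl, pelvis,
+                                               extrinsic)
+     for i in range(N):
+         single = transform_to_camera_frame(global_orient[i], transl[i],
+                                            pelvis[i], extrinsic)
+         assert np.allclose(single[0], batched[0][i], atol=1e-4)
+         assert np.allclose(single[1], batched[1][i], atol=1e-4)
+     # With an identity extrinsic, global_orient and transl are unchanged.
+     assert np.allclose(batched[0], global_orient, atol=1e-4)
+     assert np.allclose(batched[1], transl, atol=1e-4)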
diff --git a/detrsmpl/models/discriminators/__init__.py b/detrsmpl/models/discriminators/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/models/discriminators/builder.py b/detrsmpl/models/discriminators/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c981eef44bf68d097a6f6b15a157b3d1ad714de
--- /dev/null
+++ b/detrsmpl/models/discriminators/builder.py
@@ -0,0 +1,28 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmcv.utils import Registry
+
+from .pose_discriminator import (
+ FullPoseDiscriminator,
+ PoseDiscriminator,
+ ShapeDiscriminator,
+ SMPLDiscriminator,
+)
+
+DISCRIMINATORS = Registry('discriminators')
+
+DISCRIMINATORS.register_module(name='ShapeDiscriminator',
+ module=ShapeDiscriminator)
+DISCRIMINATORS.register_module(name='PoseDiscriminator',
+ module=PoseDiscriminator)
+DISCRIMINATORS.register_module(name='FullPoseDiscriminator',
+ module=FullPoseDiscriminator)
+DISCRIMINATORS.register_module(name='SMPLDiscriminator',
+ module=SMPLDiscriminator)
+
+
+def build_discriminator(cfg):
+ """Build discriminator."""
+ if cfg is None:
+ return None
+ return DISCRIMINATORS.build(cfg)
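+
+
+ # --- Illustrative sketch (added for clarity; not part of the original
+ # builder). It shows how a discriminator could be built from a config dict
+ # through the registry above; the channel/dropout tuples are example values,
+ # not defaults taken from any AiOS config.
+ if __name__ == '__main__':
+     import torch
+
+     cfg = dict(type='ShapeDiscriminator',
+                fc_layers=(10, 5, 1),
+                use_dropout=(False, False),
+                drop_prob=(0.5, 0.5),
+                use_activation=(True, False))
+     shape_disc = build_discriminator(cfg)
+     betas = torch.zeros(2, 10)  # a dummy batch of SMPL betas
+     print(shape_disc(betas).shape)  # -> torch.Size([2, 1])
+     assert build_discriminator(None) is None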
diff --git a/detrsmpl/models/discriminators/pose_discriminator.py b/detrsmpl/models/discriminators/pose_discriminator.py
new file mode 100644
index 0000000000000000000000000000000000000000..1233302688167b426e88a434d0595783566f8e8b
--- /dev/null
+++ b/detrsmpl/models/discriminators/pose_discriminator.py
@@ -0,0 +1,302 @@
+# ------------------------------------------------------------------------------
+# Adapted from https://github.com/akanazawa/hmr
+# Original licence: Copyright (c) 2018 akanazawa, under the MIT License.
+# ------------------------------------------------------------------------------
+
+from abc import abstractmethod
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import normal_init, xavier_init
+
+from detrsmpl.utils.geometry import batch_rodrigues
+
+
+class BaseDiscriminator(nn.Module):
+ """Base linear module for SMPL parameter discriminator.
+
+ Args:
+ fc_layers (Tuple): Tuple of neuron counts,
+ such as (9, 32, 32, 1)
+ use_dropout (Tuple): Tuple of bools defining whether to use dropout
+ for each layer, such as (True, True, False)
+ drop_prob (Tuple): Tuple of floats defining the dropout probability
+ of each layer, such as (0.5, 0.5, 0)
+ use_activation (Tuple): Tuple of bools defining whether to apply an
+ activation function for each layer, such as (True, True, False)
+ """
+ def __init__(self, fc_layers, use_dropout, drop_prob, use_activation):
+ super().__init__()
+ self.fc_layers = fc_layers
+ self.use_dropout = use_dropout
+ self.drop_prob = drop_prob
+ self.use_activation = use_activation
+ self._check()
+ self.create_layers()
+
+ def _check(self):
+ """Check input to avoid ValueError."""
+ if not isinstance(self.fc_layers, tuple):
+ raise TypeError(f'fc_layers requires a tuple, '
+ f'got {type(self.fc_layers)}')
+
+ if not isinstance(self.use_dropout, tuple):
+ raise TypeError(f'use_dropout requires a tuple, '
+ f'got {type(self.use_dropout)}')
+
+ if not isinstance(self.drop_prob, tuple):
+ raise TypeError(f'drop_prob requires a tuple, '
+ f'got {type(self.drop_prob)}')
+
+ if not isinstance(self.use_activation, tuple):
+ raise TypeError(f'use_activation requires a tuple, '
+ f'got {type(self.use_activation)}')
+
+ l_fc_layer = len(self.fc_layers)
+ l_use_drop = len(self.use_dropout)
+ l_drop_prob = len(self.drop_prob)
+ l_use_activation = len(self.use_activation)
+
+ pass_check = (l_fc_layer >= 2 and l_use_drop < l_fc_layer
+ and l_drop_prob < l_fc_layer
+ and l_use_activation < l_fc_layer
+ and l_drop_prob == l_use_drop)
+
+ if not pass_check:
+ msg = 'Wrong BaseDiscriminator parameters!'
+ raise ValueError(msg)
+
+ def create_layers(self):
+ """Create layers."""
+ l_fc_layer = len(self.fc_layers)
+ l_use_drop = len(self.use_dropout)
+ l_use_activation = len(self.use_activation)
+
+ self.fc_blocks = nn.Sequential()
+
+ for i in range(l_fc_layer - 1):
+ self.fc_blocks.add_module(name=f'regressor_fc_{i}',
+ module=nn.Linear(
+ in_features=self.fc_layers[i],
+ out_features=self.fc_layers[i + 1]))
+
+ if i < l_use_activation and self.use_activation[i]:
+ self.fc_blocks.add_module(name=f'regressor_af_{i}',
+ module=nn.ReLU())
+
+ if i < l_use_drop and self.use_dropout[i]:
+ self.fc_blocks.add_module(
+ name=f'regressor_fc_dropout_{i}',
+ module=nn.Dropout(p=self.drop_prob[i]))
+
+ @abstractmethod
+ def forward(self, inputs):
+ """Forward function."""
+ msg = 'the base class [BaseDiscriminator] is not callable!'
+ raise NotImplementedError(msg)
+
+ def init_weights(self):
+ """Initialize model weights."""
+ # named_modules() yields (name, module) tuples, so iterate modules()
+ # to make the isinstance check actually match the Linear layers.
+ for m in self.fc_blocks.modules():
+ if isinstance(m, nn.Linear):
+ xavier_init(m, gain=0.01)
+
+
+class ShapeDiscriminator(BaseDiscriminator):
+ """Discriminator for SMPL shape parameters, the inputs is (batch_size x 10)
+ Args:
+ fc_layers (Tuple): Tuple of neuron count,
+ such as (10, 5, 1)
+ use_dropout (Tuple): Tuple of bool define use dropout or
+ not for each layer, such as (True, True, False)
+ drop_prob (Tuple): Tuple of float defined the drop prob,
+ such as (0.5, 0)
+ use_activation(Tuple): Tuple of bool define use active
+ function or not, such as (True, False)
+ """
+ def __init__(self, fc_layers, use_dropout, drop_prob, use_activation):
+ if fc_layers[-1] != 1:
+ msg = f'the neuron count of the last layer ' \
+ f'must be 1, but got {fc_layers[-1]}'
+ raise ValueError(msg)
+
+ super().__init__(fc_layers, use_dropout, drop_prob, use_activation)
+
+ def forward(self, inputs):
+ """Forward function."""
+ return self.fc_blocks(inputs)
+
+
+class PoseDiscriminator(nn.Module):
+ """Discriminator for SMPL pose parameters of each joint.
+
+ It is composed of
+ discriminators for each joints. The inputs is (batch_size x joint_count x
+ 9)
+ Args:
+ channels (Tuple): Tuple of channel number,
+ such as (9, 32, 32, 1)
+ joint_count (int): Joint number, such as 23
+ """
+ def __init__(self, channels, joint_count):
+ super().__init__()
+ if channels[-1] != 1:
+ msg = f'the neuron count of the last layer ' \
+ f'must be 1, but got {channels[-1]}'
+ raise ValueError(msg)
+ self.joint_count = joint_count
+
+ self.conv_blocks = nn.Sequential()
+ len_channels = len(channels)
+ for idx in range(len_channels - 2):
+ self.conv_blocks.add_module(name=f'conv_{idx}',
+ module=nn.Conv2d(
+ in_channels=channels[idx],
+ out_channels=channels[idx + 1],
+ kernel_size=1,
+ stride=1))
+
+ self.fc_layer = nn.ModuleList()
+ for idx in range(joint_count):
+ self.fc_layer.append(
+ nn.Linear(in_features=channels[len_channels - 2],
+ out_features=1))
+
+ def forward(self, inputs):
+ """Forward function.
+
+ The input is (batch_size x joint_count x 9)
+ """
+ # shape: batch_size x 9 x 1 x joint_count
+ inputs = inputs.transpose(1, 2).unsqueeze(2).contiguous()
+ # shape: batch_size x c x 1 x joint_count
+ internal_outputs = self.conv_blocks(inputs)
+ outputs = []
+ for idx in range(self.joint_count):
+ outputs.append(self.fc_layer[idx](internal_outputs[:, :, 0, idx]))
+
+ return torch.cat(outputs, 1), internal_outputs
+
+ def init_weights(self):
+ """Initialize model weights."""
+ for m in self.conv_blocks:
+ if isinstance(m, nn.Conv2d):
+ normal_init(m, std=0.001, bias=0)
+ # named_modules() yields (name, module) tuples, so iterate modules()
+ # to make the isinstance check actually match the Linear layers.
+ for m in self.fc_layer.modules():
+ if isinstance(m, nn.Linear):
+ xavier_init(m, gain=0.01)
+
+
+class FullPoseDiscriminator(BaseDiscriminator):
+ """Discriminator for SMPL pose parameters of all joints.
+
+ Args:
+ fc_layers (Tuple): Tuple of neuron counts,
+ such as (736, 1024, 1024, 1)
+ use_dropout (Tuple): Tuple of bools defining whether to use dropout
+ for each layer, such as (True, True, False)
+ drop_prob (Tuple): Tuple of floats defining the dropout probability
+ of each layer, such as (0.5, 0.5, 0)
+ use_activation (Tuple): Tuple of bools defining whether to apply an
+ activation function for each layer, such as (True, True, False)
+ """
+ def __init__(self, fc_layers, use_dropout, drop_prob, use_activation):
+ if fc_layers[-1] != 1:
+ msg = f'the neuron count of the last layer must be 1,' \
+ f' but got {fc_layers[-1]}'
+ raise ValueError(msg)
+
+ super().__init__(fc_layers, use_dropout, drop_prob, use_activation)
+
+ def forward(self, inputs):
+ """Forward function."""
+ return self.fc_blocks(inputs)
+
+
+class SMPLDiscriminator(nn.Module):
+ """Discriminator for SMPL pose and shape parameters.
+
+ It is composed of a
+ discriminator for SMPL shape parameters, a discriminator for SMPL pose
+ parameters of all joints and a discriminator for SMPL pose parameters of
+ each joint.
+ Args:
+ beta_channel (tuple of int): Tuple of neuron count of the
+ discriminator of shape parameters. Defaults to (10, 5, 1)
+ per_joint_channel (tuple of int): Tuple of neuron count of the
+ discriminator of each joint. Defaults to (9, 32, 32, 1)
+ full_pose_channel (tuple of int): Tuple of neuron count of the
+ discriminator of full pose. Defaults to (23*32, 1024, 1024, 1)
+ """
+ def __init__(self,
+ beta_channel=(10, 5, 1),
+ per_joint_channel=(9, 32, 32, 1),
+ full_pose_channel=(23 * 32, 1024, 1024, 1)):
+ super().__init__()
+ self.joint_count = 23
+ # The count of SMPL shape parameter is 10.
+ assert beta_channel[0] == 10
+ # Use 3 x 3 rotation matrix as the pose parameters
+ # of each joint, so the input channel is 9.
+ assert per_joint_channel[0] == 9
+ assert self.joint_count * per_joint_channel[-2] \
+ == full_pose_channel[0]
+
+ self.beta_channel = beta_channel
+ self.per_joint_channel = per_joint_channel
+ self.full_pose_channel = full_pose_channel
+ self._create_sub_modules()
+
+ def _create_sub_modules(self):
+ """Create sub discriminators."""
+
+ # create theta discriminator for each joint
+ self.pose_discriminator = PoseDiscriminator(self.per_joint_channel,
+ self.joint_count)
+
+ # create full pose discriminator for all joints
+ fc_layers = self.full_pose_channel
+ use_dropout = tuple([False] * (len(fc_layers) - 1))
+ drop_prob = tuple([0.5] * (len(fc_layers) - 1))
+ use_activation = tuple([True] * (len(fc_layers) - 2) + [False])
+
+ self.full_pose_discriminator = FullPoseDiscriminator(
+ fc_layers, use_dropout, drop_prob, use_activation)
+
+ # create shape discriminator for betas
+ fc_layers = self.beta_channel
+ use_dropout = tuple([False] * (len(fc_layers) - 1))
+ drop_prob = tuple([0.5] * (len(fc_layers) - 1))
+ use_activation = tuple([True] * (len(fc_layers) - 2) + [False])
+ self.shape_discriminator = ShapeDiscriminator(fc_layers, use_dropout,
+ drop_prob,
+ use_activation)
+
+ def forward(self, thetas):
+ """Forward function."""
+ _, poses, shapes = thetas
+
+ batch_size = poses.shape[0]
+ shape_disc_value = self.shape_discriminator(shapes)
+
+ # The first rotation matrix is global rotation
+ # and is NOT used in discriminator.
+ if poses.dim() == 2:
+ rotate_matrixs = \
+ batch_rodrigues(poses.contiguous().view(-1, 3)
+ ).view(batch_size, 24, 9)[:, 1:, :]
+ else:
+ rotate_matrixs = poses.contiguous().view(batch_size, 24,
+ 9)[:, 1:, :].contiguous()
+ pose_disc_value, pose_inter_disc_value \
+ = self.pose_discriminator(rotate_matrixs)
+ full_pose_disc_value = self.full_pose_discriminator(
+ pose_inter_disc_value.contiguous().view(batch_size, -1))
+ return torch.cat(
+ (pose_disc_value, full_pose_disc_value, shape_disc_value), 1)
+
+ def init_weights(self):
+ """Initialize model weights."""
+ self.full_pose_discriminator.init_weights()
+ self.pose_discriminator.init_weights()
+ self.shape_discriminator.init_weights()
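+
+
+ # --- Illustrative sketch (added for clarity; not part of the original
+ # module). It runs the default SMPLDiscriminator on dummy inputs to show the
+ # expected shapes: `thetas` is a (cams, poses, shapes) tuple, poses hold 24
+ # axis-angle rotations per sample and shapes hold 10 betas. The output
+ # concatenates 23 per-joint scores, 1 full-pose score and 1 shape score.
+ if __name__ == '__main__':
+     disc = SMPLDiscriminator()
+     disc.init_weights()
+     batch_size = 2
+     cams = torch.zeros(batch_size, 3)  # unused by the discriminator
+     poses = torch.zeros(batch_size, 24 * 3)  # axis-angle body pose
+     shapes = torch.zeros(batch_size, 10)  # SMPL betas
+     out = disc((cams, poses, shapes))
+     print(out.shape)  # -> torch.Size([2, 25])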
diff --git a/detrsmpl/models/heads/__init__.py b/detrsmpl/models/heads/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/models/heads/builder.py b/detrsmpl/models/heads/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..f8df6333e9ed58bb7608a27be3ae77fa2b327389
--- /dev/null
+++ b/detrsmpl/models/heads/builder.py
@@ -0,0 +1,27 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmcv.utils import Registry
+
+from .detr_head import DeformableDETRHead, DETRHead
+from .expose_head import ExPoseBodyHead, ExPoseFaceHead, ExPoseHandHead
+from .hmr_head import HMRHead
+from .hybrik_head import HybrIKHead
+from .pare_head import PareHead
+
+HEADS = Registry('heads')
+
+HEADS.register_module(name='HybrIKHead', module=HybrIKHead)
+HEADS.register_module(name='HMRHead', module=HMRHead)
+HEADS.register_module(name='PareHead', module=PareHead)
+HEADS.register_module(name='ExPoseBodyHead', module=ExPoseBodyHead)
+HEADS.register_module(name='ExPoseHandHead', module=ExPoseHandHead)
+HEADS.register_module(name='ExPoseFaceHead', module=ExPoseFaceHead)
+HEADS.register_module(name='DETRHead', module=DETRHead)
+HEADS.register_module(name='DeformableDETRHead', module=DeformableDETRHead)
+
+
+def build_head(cfg):
+ """Build head."""
+ if cfg is None:
+ return None
+ return HEADS.build(cfg)
diff --git a/detrsmpl/models/heads/detr_head.py b/detrsmpl/models/heads/detr_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ef62b2b7651b98ba9b2359888ff033674fc1b0e
--- /dev/null
+++ b/detrsmpl/models/heads/detr_head.py
@@ -0,0 +1,1504 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import warnings
+from abc import ABCMeta
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import (
+ Conv2d,
+ ConvModule,
+ Linear,
+ bias_init_with_prob,
+ build_activation_layer,
+ constant_init,
+)
+from mmcv.cnn.bricks.transformer import FFN
+from mmcv.ops import batched_nms
+from mmcv.runner import BaseModule, force_fp32
+
+from detrsmpl.core.post_processing.bbox.assigners import build_assigner
+# from detrsmpl.core.post_processing.bbox.coder import build_bbox_coder
+from detrsmpl.core.post_processing.bbox.samplers import build_sampler
+from detrsmpl.core.post_processing.bbox.transforms import (
+ bbox_cxcywh_to_xyxy,
+ bbox_xyxy_to_cxcywh,
+)
+# from mmdet.core.anchor.point_generator import MlvlPointGenerator
+# from mmdet.core.utils import filter_scores_and_topk, select_single_mlvl
+from detrsmpl.models.utils import (
+ build_positional_encoding,
+ build_transformer,
+ inverse_sigmoid,
+)
+from detrsmpl.utils.dist_utils import reduce_mean
+from detrsmpl.utils.geometry import rot6d_to_rotmat
+# from utils.misc import multi_apply
+from detrsmpl.utils.misc import multi_apply
+from ..losses.builder import build_loss
+
+
+class DETRHead(BaseModule, metaclass=ABCMeta):
+ """Implements the DETR transformer head.
+
+ See `paper: End-to-End Object Detection with Transformers
+ <https://arxiv.org/abs/2005.12872>`_ for details.
+
+ Args:
+ num_classes (int): Number of categories excluding the background.
+ in_channels (int): Number of channels in the input feature map.
+ num_query (int): Number of queries in the Transformer.
+ num_reg_fcs (int, optional): Number of fully-connected layers used in
+ `FFN`, which is then used for the regression head. Default 2.
+ transformer (obj:`mmcv.ConfigDict`|dict): Config for transformer.
+ Default: None.
+ sync_cls_avg_factor (bool): Whether to sync the avg_factor of
+ all ranks. Default to False.
+ positional_encoding (obj:`mmcv.ConfigDict`|dict):
+ Config for position encoding.
+ loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the
+ classification loss. Default `CrossEntropyLoss`.
+ loss_bbox (obj:`mmcv.ConfigDict`|dict): Config of the
+ regression loss. Default `L1Loss`.
+ loss_iou (obj:`mmcv.ConfigDict`|dict): Config of the
+ regression iou loss. Default `GIoULoss`.
+ train_cfg (obj:`mmcv.ConfigDict`|dict): Training config of
+ transformer head.
+ test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of
+ transformer head.
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Default: None
+ """
+
+ _version = 2
+
+ def __init__(
+ self,
+ num_classes,
+ in_channels,
+ # anchor free
+ feat_channels=256,
+ stacked_convs=4,
+ strides=(4, 8, 16, 32, 64),
+ dcn_on_last_conv=False,
+ conv_bias='auto',
+ num_query=100,
+ num_reg_fcs=2,
+ transformer=None,
+ sync_cls_avg_factor=False,
+ positional_encoding=dict(type='SinePositionalEncoding',
+ num_feats=128,
+ normalize=True),
+ loss_cls=dict(type='CrossEntropyLoss',
+ bg_cls_weight=0.1,
+ use_sigmoid=False,
+ loss_weight=1.0,
+ class_weight=1.0),
+ loss_bbox=dict(type='L1Loss', loss_weight=5.0),
+ loss_iou=dict(type='GIoULoss', loss_weight=2.0),
+ # anchor free
+ bbox_coder=dict(type='DistancePointBBoxCoder'),
+ conv_cfg=None,
+ norm_cfg=None,
+ train_cfg=dict(assigner=dict(
+ type='HungarianAssigner',
+ # cls_cost=dict(type='ClassificationCost', weight=1.),
+ # reg_cost=dict(type='BBoxL1Cost', weight=5.0),
+ # iou_cost=dict(type='IoUCost', iou_mode='giou',
+ # weight=2.0)
+ kp3d_cost=dict(
+ type='Keypoints3DCost', convention='smpl_54', weight=5.0),
+ kp2d_cost=dict(
+ type='Keypoints2DCost', convention='smpl_54', weight=5.0),
+ )),
+ test_cfg=dict(max_per_img=100),
+ init_cfg=dict(type='Normal',
+ layer='Conv2d',
+ std=0.01,
+ override=dict(type='Normal',
+ name='conv_cls',
+ std=0.01,
+ bias_prob=0.01)),
+ **kwargs):
+ # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
+ # since it brings inconvenience when the initialization of
+ # `AnchorFreeHead` is called.
+ super(DETRHead, self).__init__(init_cfg)
+ self.bg_cls_weight = 0
+ self.sync_cls_avg_factor = sync_cls_avg_factor
+ class_weight = loss_cls.get('class_weight', None)
+ if class_weight is not None and (self.__class__ is DETRHead):
+ assert isinstance(class_weight, float), 'Expected ' \
+ 'class_weight to have type float. Found ' \
+ f'{type(class_weight)}.'
+ # NOTE following the official DETR repo, bg_cls_weight means
+ # relative classification weight of the no-object class.
+ bg_cls_weight = loss_cls.get('bg_cls_weight', class_weight)
+ assert isinstance(bg_cls_weight, float), 'Expected ' \
+ 'bg_cls_weight to have type float. Found ' \
+ f'{type(bg_cls_weight)}.'
+ class_weight = torch.ones(num_classes + 1) * class_weight
+ # set background class as the last index
+ class_weight[num_classes] = bg_cls_weight
+ loss_cls.update({'class_weight': class_weight})
+ if 'bg_cls_weight' in loss_cls:
+ loss_cls.pop('bg_cls_weight')
+ self.bg_cls_weight = bg_cls_weight
+
+ if train_cfg:
+ assert 'assigner' in train_cfg, 'assigner should be provided '\
+ 'when train_cfg is set.'
+ assigner = train_cfg['assigner']
+ # TODO: update these
+ # assert loss_cls['loss_weight'] == assigner['kp3d_cost']['weight'], \
+ # 'The classification weight for loss and matcher should be' \
+ # 'exactly the same.'
+ # assert loss_bbox['loss_weight'] == assigner['kp3d_cost'][
+ # 'weight'], 'The regression L1 weight for loss and matcher ' \
+ # 'should be exactly the same.'
+ # assert loss_iou['loss_weight'] == assigner['kp3d_cost']['weight'], \
+ # 'The regression iou weight for loss and matcher should be' \
+ # 'exactly the same.'
+ self.assigner = build_assigner(assigner)
+ # DETR sampling=False, so use PseudoSampler
+ sampler_cfg = dict(type='PseudoSampler')
+ self.sampler = build_sampler(sampler_cfg, context=self)
+
+ self.num_query = num_query
+ self.num_classes = num_classes
+ self.in_channels = in_channels
+ self.num_reg_fcs = num_reg_fcs
+ self.train_cfg = train_cfg
+ self.test_cfg = test_cfg
+ self.fp16_enabled = False
+ self.loss_cls = build_loss(loss_cls)
+ self.loss_bbox = build_loss(loss_bbox)
+ self.loss_iou = build_loss(loss_iou)
+
+ if self.loss_cls.use_sigmoid:
+ self.cls_out_channels = num_classes
+ else:
+ self.cls_out_channels = num_classes + 1
+ self.act_cfg = transformer.get('act_cfg',
+ dict(type='ReLU', inplace=True))
+ self.activate = build_activation_layer(self.act_cfg)
+ self.positional_encoding = build_positional_encoding(
+ positional_encoding)
+ self.transformer = build_transformer(transformer)
+ self.embed_dims = self.transformer.embed_dims
+ assert 'num_feats' in positional_encoding
+ num_feats = positional_encoding['num_feats']
+ assert num_feats * 2 == self.embed_dims, 'embed_dims should' \
+ f' be exactly 2 times of num_feats. Found {self.embed_dims}' \
+ f' and {num_feats}.'
+ self._init_layers()
+
+ def _init_layers(self):
+ """Initialize layers of the transformer head."""
+ self.input_proj = Conv2d(self.in_channels,
+ self.embed_dims,
+ kernel_size=1)
+ self.fc_cls = Linear(self.embed_dims, self.cls_out_channels)
+ self.reg_ffn = FFN(self.embed_dims,
+ self.embed_dims,
+ self.num_reg_fcs,
+ self.act_cfg,
+ dropout=0.0,
+ add_residual=False)
+ self.fc_reg = Linear(self.embed_dims, 4)
+ self.query_embedding = nn.Embedding(self.num_query, self.embed_dims)
+
+ def init_weights(self):
+ """Initialize weights of the transformer head."""
+ # The initialization for transformer is important
+ self.transformer.init_weights()
+
+ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
+ missing_keys, unexpected_keys, error_msgs):
+ """load checkpoints."""
+ # NOTE here use `AnchorFreeHead` instead of `TransformerHead`,
+ # since `AnchorFreeHead._load_from_state_dict` should not be
+ # called here. Invoking the default `Module._load_from_state_dict`
+ # is enough.
+
+ # Names of some parameters have been changed.
+ version = local_metadata.get('version', None)
+ if (version is None or version < 2) and self.__class__ is DETRHead:
+ convert_dict = {
+ '.self_attn.': '.attentions.0.',
+ '.ffn.': '.ffns.0.',
+ '.multihead_attn.': '.attentions.1.',
+ '.decoder.norm.': '.decoder.post_norm.'
+ }
+ state_dict_keys = list(state_dict.keys())
+ for k in state_dict_keys:
+ for ori_key, convert_key in convert_dict.items():
+ if ori_key in k:
+ convert_key = k.replace(ori_key, convert_key)
+ state_dict[convert_key] = state_dict[k]
+ del state_dict[k]
+
+ super()._load_from_state_dict(state_dict, prefix, local_metadata,
+ strict, missing_keys, unexpected_keys,
+ error_msgs)
+
+ def forward(self, feats, img_metas):
+ """Forward function.
+
+ Args:
+ feats (tuple[Tensor]): Features from the upstream network, each is
+ a 4D-tensor.
+ img_metas (list[dict]): List of image information.
+
+ Returns:
+ tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels.
+
+ - all_cls_scores_list (list[Tensor]): Classification scores \
+ for each scale level. Each is a 4D-tensor with shape \
+ [nb_dec, bs, num_query, cls_out_channels]. Note \
+ `cls_out_channels` should include the background class.
+ - all_bbox_preds_list (list[Tensor]): Sigmoid regression \
+ outputs for each scale level. Each is a 4D-tensor with \
+ normalized coordinate format (cx, cy, w, h) and shape \
+ [nb_dec, bs, num_query, 4].
+ """
+ num_levels = len(feats)
+ img_metas_list = [img_metas for _ in range(num_levels)]
+ return multi_apply(self.forward_single, feats, img_metas_list)
+
+ def forward_single(self, x, img_metas):
+ """"Forward function for a single feature level.
+
+ Args:
+ x (Tensor): Input feature from backbone's single stage, shape
+ [bs, c, h, w].
+ img_metas (list[dict]): List of image information.
+
+ Returns:
+ all_cls_scores (Tensor): Outputs from the classification head,
+ shape [nb_dec, bs, num_query, cls_out_channels]. Note
+ cls_out_channels should include the background class.
+ all_bbox_preds (Tensor): Sigmoid outputs from the regression
+ head with normalized coordinate format (cx, cy, w, h).
+ Shape [nb_dec, bs, num_query, 4].
+ """
+ # construct binary masks which are used for the transformer.
+ # NOTE following the official DETR repo, non-zero values represent
+ # ignored positions, while zero values mean valid positions.
+ batch_size = x.size(0)
+ input_img_h, input_img_w = img_metas[0]['batch_input_shape']
+ masks = x.new_ones((batch_size, input_img_h, input_img_w))
+ for img_id in range(batch_size):
+ img_h, img_w, _ = img_metas[img_id]['img_shape']
+ masks[img_id, :img_h, :img_w] = 0
+
+ x = self.input_proj(x)
+ # interpolate masks to have the same spatial shape with x
+ masks = F.interpolate(masks.unsqueeze(1),
+ size=x.shape[-2:]).to(torch.bool).squeeze(1)
+ # position encoding
+ pos_embed = self.positional_encoding(masks) # [bs, embed_dim, h, w]
+ # outs_dec: [nb_dec, bs, num_query, embed_dim]
+ outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight,
+ pos_embed)
+
+ all_cls_scores = self.fc_cls(outs_dec)
+ all_bbox_preds = self.fc_reg(self.activate(
+ self.reg_ffn(outs_dec))).sigmoid()
+ return all_cls_scores, all_bbox_preds
+
+ @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
+ def loss(self,
+ all_cls_scores_list,
+ all_bbox_preds_list,
+ gt_bboxes_list,
+ gt_labels_list,
+ img_metas,
+ gt_bboxes_ignore=None):
+ """"Loss function.
+
+ Only outputs from the last feature level are used for computing
+ losses by default.
+
+ Args:
+ all_cls_scores_list (list[Tensor]): Classification outputs
+ for each feature level. Each is a 4D-tensor with shape
+ [nb_dec, bs, num_query, cls_out_channels].
+ all_bbox_preds_list (list[Tensor]): Sigmoid regression
+ outputs for each feature level. Each is a 4D-tensor with
+ normalized coordinate format (cx, cy, w, h) and shape
+ [nb_dec, bs, num_query, 4].
+ gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+ with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+ gt_labels_list (list[Tensor]): Ground truth class indices for each
+ image with shape (num_gts, ).
+ img_metas (list[dict]): List of image meta information.
+ gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
+ which can be ignored for each image. Default None.
+
+ Returns:
+ dict[str, Tensor]: A dictionary of loss components.
+ """
+ # NOTE by default only the outputs from the last feature scale are used.
+ all_cls_scores = all_cls_scores_list[-1]
+ all_bbox_preds = all_bbox_preds_list[-1]
+ assert gt_bboxes_ignore is None, \
+ 'Only supports for gt_bboxes_ignore setting to None.'
+
+ num_dec_layers = len(all_cls_scores)
+ all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
+ all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
+ all_gt_bboxes_ignore_list = [
+ gt_bboxes_ignore for _ in range(num_dec_layers)
+ ]
+ img_metas_list = [img_metas for _ in range(num_dec_layers)]
+
+ losses_cls, losses_bbox, losses_iou = multi_apply(
+ self.loss_single, all_cls_scores, all_bbox_preds,
+ all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
+ all_gt_bboxes_ignore_list)
+
+ loss_dict = dict()
+ # loss from the last decoder layer
+ loss_dict['loss_cls'] = losses_cls[-1]
+ loss_dict['loss_bbox'] = losses_bbox[-1]
+ loss_dict['loss_iou'] = losses_iou[-1]
+ # loss from other decoder layers
+ num_dec_layer = 0
+ for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
+ losses_bbox[:-1],
+ losses_iou[:-1]):
+ loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
+ loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
+ loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
+ num_dec_layer += 1
+ return loss_dict
+
+ def loss_single(self,
+ cls_scores,
+ bbox_preds,
+ gt_bboxes_list,
+ gt_labels_list,
+ img_metas,
+ gt_bboxes_ignore_list=None):
+ """"Loss function for outputs from a single decoder layer of a single
+ feature level.
+
+ Args:
+ cls_scores (Tensor): Box score logits from a single decoder layer
+ for all images. Shape [bs, num_query, cls_out_channels].
+ bbox_preds (Tensor): Sigmoid outputs from a single decoder layer
+ for all images, with normalized coordinate (cx, cy, w, h) and
+ shape [bs, num_query, 4].
+ gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+ with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+ gt_labels_list (list[Tensor]): Ground truth class indices for each
+ image with shape (num_gts, ).
+ img_metas (list[dict]): List of image meta information.
+ gt_bboxes_ignore_list (list[Tensor], optional): Bounding
+ boxes which can be ignored for each image. Default None.
+
+ Returns:
+ tuple[Tensor]: The classification, bbox regression and IoU losses
+ for outputs from a single decoder layer.
+ """
+ num_imgs = cls_scores.size(0)
+ cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
+ bbox_preds_list = [bbox_preds[i] for i in range(num_imgs)]
+ cls_reg_targets = self.get_targets(cls_scores_list, bbox_preds_list,
+ gt_bboxes_list, gt_labels_list,
+ img_metas, gt_bboxes_ignore_list)
+ (labels_list, label_weights_list, bbox_targets_list, bbox_weights_list,
+ num_total_pos, num_total_neg) = cls_reg_targets
+ labels = torch.cat(labels_list, 0)
+ label_weights = torch.cat(label_weights_list, 0)
+ bbox_targets = torch.cat(bbox_targets_list, 0)
+ bbox_weights = torch.cat(bbox_weights_list, 0)
+
+ # classification loss
+ cls_scores = cls_scores.reshape(-1, self.cls_out_channels)
+ # construct weighted avg_factor to match with the official DETR repo
+ cls_avg_factor = num_total_pos * 1.0 + \
+ num_total_neg * self.bg_cls_weight
+ if self.sync_cls_avg_factor:
+ cls_avg_factor = reduce_mean(
+ cls_scores.new_tensor([cls_avg_factor]))
+ cls_avg_factor = max(cls_avg_factor, 1)
+
+ loss_cls = self.loss_cls(cls_scores,
+ labels,
+ label_weights,
+ avg_factor=cls_avg_factor)
+
+ # Compute the average number of gt boxes across all gpus, for
+ # normalization purposes
+ num_total_pos = loss_cls.new_tensor([num_total_pos])
+ num_total_pos = torch.clamp(reduce_mean(num_total_pos), min=1).item()
+
+ # construct factors used for rescale bboxes
+ factors = []
+ for img_meta, bbox_pred in zip(img_metas, bbox_preds):
+ img_h, img_w, _ = img_meta['img_shape']
+ factor = bbox_pred.new_tensor([img_w, img_h, img_w,
+ img_h]).unsqueeze(0).repeat(
+ bbox_pred.size(0), 1)
+ factors.append(factor)
+ factors = torch.cat(factors, 0)
+
+ # DETR regresses the relative position of boxes (cxcywh) in the image,
+ # thus the learning target is normalized by the image size. So here
+ # we need to re-scale them for calculating the IoU loss.
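+ # Worked example (illustrative values): for a 100 x 200 (h x w) image,
+ # factor = [200, 100, 200, 100]; a normalized cxcywh prediction
+ # [0.5, 0.5, 0.2, 0.4] becomes xyxy [0.4, 0.3, 0.6, 0.7] and, after
+ # multiplying by factor, the pixel box [80, 30, 120, 70] on which the
+ # GIoU loss is computed.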
+ bbox_preds = bbox_preds.reshape(-1, 4)
+ bboxes = bbox_cxcywh_to_xyxy(bbox_preds) * factors
+ bboxes_gt = bbox_cxcywh_to_xyxy(bbox_targets) * factors
+
+ # regression IoU loss, GIoU loss by default
+ loss_iou = self.loss_iou(bboxes,
+ bboxes_gt,
+ bbox_weights,
+ avg_factor=num_total_pos)
+
+ # regression L1 loss
+ loss_bbox = self.loss_bbox(bbox_preds,
+ bbox_targets,
+ bbox_weights,
+ avg_factor=num_total_pos)
+ return loss_cls, loss_bbox, loss_iou
+
+ def get_targets(self,
+ cls_scores_list,
+ bbox_preds_list,
+ gt_bboxes_list,
+ gt_labels_list,
+ img_metas,
+ gt_bboxes_ignore_list=None):
+ """"Compute regression and classification targets for a batch image.
+
+ Outputs from a single decoder layer of a single feature level are used.
+
+ Args:
+ cls_scores_list (list[Tensor]): Box score logits from a single
+ decoder layer for each image with shape [num_query,
+ cls_out_channels].
+ bbox_preds_list (list[Tensor]): Sigmoid outputs from a single
+ decoder layer for each image, with normalized coordinate
+ (cx, cy, w, h) and shape [num_query, 4].
+ gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+ with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+ gt_labels_list (list[Tensor]): Ground truth class indices for each
+ image with shape (num_gts, ).
+ img_metas (list[dict]): List of image meta information.
+ gt_bboxes_ignore_list (list[Tensor], optional): Bounding
+ boxes which can be ignored for each image. Default None.
+
+ Returns:
+ tuple: a tuple containing the following targets.
+
+ - labels_list (list[Tensor]): Labels for all images.
+ - label_weights_list (list[Tensor]): Label weights for all \
+ images.
+ - bbox_targets_list (list[Tensor]): BBox targets for all \
+ images.
+ - bbox_weights_list (list[Tensor]): BBox weights for all \
+ images.
+ - num_total_pos (int): Number of positive samples in all \
+ images.
+ - num_total_neg (int): Number of negative samples in all \
+ images.
+ """
+ assert gt_bboxes_ignore_list is None, \
+ 'Only supports for gt_bboxes_ignore setting to None.'
+ num_imgs = len(cls_scores_list)
+ gt_bboxes_ignore_list = [
+ gt_bboxes_ignore_list for _ in range(num_imgs)
+ ]
+
+ (labels_list, label_weights_list, bbox_targets_list,
+ bbox_weights_list, pos_inds_list, neg_inds_list) = multi_apply(
+ self._get_target_single, cls_scores_list, bbox_preds_list,
+ gt_bboxes_list, gt_labels_list, img_metas, gt_bboxes_ignore_list)
+ num_total_pos = sum((inds.numel() for inds in pos_inds_list))
+ num_total_neg = sum((inds.numel() for inds in neg_inds_list))
+ return (labels_list, label_weights_list, bbox_targets_list,
+ bbox_weights_list, num_total_pos, num_total_neg)
+
+ def _get_target_single(self,
+ cls_score,
+ bbox_pred,
+ gt_bboxes,
+ gt_labels,
+ img_meta,
+ gt_bboxes_ignore=None):
+ """"Compute regression and classification targets for one image.
+
+ Outputs from a single decoder layer of a single feature level are used.
+
+ Args:
+ cls_score (Tensor): Box score logits from a single decoder layer
+ for one image. Shape [num_query, cls_out_channels].
+ bbox_pred (Tensor): Sigmoid outputs from a single decoder layer
+ for one image, with normalized coordinate (cx, cy, w, h) and
+ shape [num_query, 4].
+ gt_bboxes (Tensor): Ground truth bboxes for one image with
+ shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+ gt_labels (Tensor): Ground truth class indices for one image
+ with shape (num_gts, ).
+ img_meta (dict): Meta information for one image.
+ gt_bboxes_ignore (Tensor, optional): Bounding boxes
+ which can be ignored. Default None.
+
+ Returns:
+ tuple[Tensor]: a tuple containing the following for one image.
+
+ - labels (Tensor): Labels of each image.
+ - label_weights (Tensor): Label weights of each image.
+ - bbox_targets (Tensor): BBox targets of each image.
+ - bbox_weights (Tensor): BBox weights of each image.
+ - pos_inds (Tensor): Sampled positive indices for each image.
+ - neg_inds (Tensor): Sampled negative indices for each image.
+ """
+
+ num_bboxes = bbox_pred.size(0)
+ # assigner and sampler
+ assign_result = self.assigner.assign(bbox_pred, cls_score, gt_bboxes,
+ gt_labels, img_meta,
+ gt_bboxes_ignore)
+ sampling_result = self.sampler.sample(assign_result, bbox_pred,
+ gt_bboxes)
+ pos_inds = sampling_result.pos_inds
+ neg_inds = sampling_result.neg_inds
+
+ # label targets
+ labels = gt_bboxes.new_full((num_bboxes, ),
+ self.num_classes,
+ dtype=torch.long)
+ labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds]
+ label_weights = gt_bboxes.new_ones(num_bboxes)
+
+ # bbox targets
+ bbox_targets = torch.zeros_like(bbox_pred)
+ bbox_weights = torch.zeros_like(bbox_pred)
+ bbox_weights[pos_inds] = 1.0
+ img_h, img_w, _ = img_meta['img_shape']
+
+ # DETR regresses the relative position of boxes (cxcywh) in the image.
+ # Thus the learning target should be normalized by the image size, and
+ # the box format should be converted from the default x1y1x2y2 to cxcywh.
+ factor = bbox_pred.new_tensor([img_w, img_h, img_w,
+ img_h]).unsqueeze(0)
+ pos_gt_bboxes_normalized = sampling_result.pos_gt_bboxes / factor
+ pos_gt_bboxes_targets = bbox_xyxy_to_cxcywh(pos_gt_bboxes_normalized)
+ bbox_targets[pos_inds] = pos_gt_bboxes_targets
+ return (labels, label_weights, bbox_targets, bbox_weights, pos_inds,
+ neg_inds)
+
+ # over-write because img_metas are needed as inputs for bbox_head.
+ def forward_train(self,
+ x,
+ img_metas,
+ gt_bboxes,
+ gt_labels=None,
+ gt_bboxes_ignore=None,
+ proposal_cfg=None,
+ **kwargs):
+ """Forward function for training mode.
+
+ Args:
+ x (list[Tensor]): Features from backbone.
+ img_metas (list[dict]): Meta information of each image, e.g.,
+ image size, scaling factor, etc.
+ gt_bboxes (Tensor): Ground truth bboxes of the image,
+ shape (num_gts, 4).
+ gt_labels (Tensor): Ground truth labels of each box,
+ shape (num_gts,).
+ gt_bboxes_ignore (Tensor): Ground truth bboxes to be
+ ignored, shape (num_ignored_gts, 4).
+ proposal_cfg (mmcv.Config): Test / postprocessing configuration,
+ if None, test_cfg would be used.
+
+ Returns:
+ dict[str, Tensor]: A dictionary of loss components.
+ """
+ assert proposal_cfg is None, '"proposal_cfg" must be None'
+ outs = self(x, img_metas)
+ if gt_labels is None:
+ loss_inputs = outs + (gt_bboxes, img_metas)
+ else:
+ loss_inputs = outs + (gt_bboxes, gt_labels, img_metas)
+ losses = self.loss(*loss_inputs, gt_bboxes_ignore=gt_bboxes_ignore)
+ return losses
+
+ @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
+ def get_bboxes(self,
+ all_cls_scores_list,
+ all_bbox_preds_list,
+ img_metas,
+ rescale=False):
+ """Transform network outputs for a batch into bbox predictions.
+
+ Args:
+ all_cls_scores_list (list[Tensor]): Classification outputs
+ for each feature level. Each is a 4D-tensor with shape
+ [nb_dec, bs, num_query, cls_out_channels].
+ all_bbox_preds_list (list[Tensor]): Sigmoid regression
+ outputs for each feature level. Each is a 4D-tensor with
+ normalized coordinate format (cx, cy, w, h) and shape
+ [nb_dec, bs, num_query, 4].
+ img_metas (list[dict]): Meta information of each image.
+ rescale (bool, optional): If True, return boxes in original
+ image space. Default False.
+
+ Returns:
+ list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \
+ The first item is an (n, 5) tensor, where the first 4 columns \
+ are bounding box positions (tl_x, tl_y, br_x, br_y) and the \
+ 5-th column is a score between 0 and 1. The second item is a \
+ (n,) tensor where each item is the predicted class label of \
+ the corresponding box.
+ """
+ # NOTE by default only outputs from the last feature level are used,
+ # and only the outputs from the last decoder layer are used.
+ cls_scores = all_cls_scores_list[-1][-1]
+ bbox_preds = all_bbox_preds_list[-1][-1]
+
+ result_list = []
+ for img_id in range(len(img_metas)):
+ cls_score = cls_scores[img_id]
+ bbox_pred = bbox_preds[img_id]
+ img_shape = img_metas[img_id]['img_shape']
+ scale_factor = img_metas[img_id]['scale_factor']
+ proposals = self._get_bboxes_single(cls_score, bbox_pred,
+ img_shape, scale_factor,
+ rescale)
+ result_list.append(proposals)
+
+ return result_list
+
+ def _get_bboxes_single(self,
+ cls_score,
+ bbox_pred,
+ img_shape,
+ scale_factor,
+ rescale=False):
+ """Transform outputs from the last decoder layer into bbox predictions
+ for each image.
+
+ Args:
+ cls_score (Tensor): Box score logits from the last decoder layer
+ for each image. Shape [num_query, cls_out_channels].
+ bbox_pred (Tensor): Sigmoid outputs from the last decoder layer
+ for each image, with coordinate format (cx, cy, w, h) and
+ shape [num_query, 4].
+ img_shape (tuple[int]): Shape of input image, (height, width, 3).
+ scale_factor (ndarray, optional): Scale factor of the image arranged
+ as (w_scale, h_scale, w_scale, h_scale).
+ rescale (bool, optional): If True, return boxes in original image
+ space. Default False.
+
+ Returns:
+ tuple[Tensor]: Results of detected bboxes and labels.
+
+ - det_bboxes: Predicted bboxes with shape [num_query, 5], \
+ where the first 4 columns are bounding box positions \
+ (tl_x, tl_y, br_x, br_y) and the 5-th column are scores \
+ between 0 and 1.
+ - det_labels: Predicted labels of the corresponding box with \
+ shape [num_query].
+ """
+ assert len(cls_score) == len(bbox_pred)
+ max_per_img = self.test_cfg.get('max_per_img', self.num_query)
+ # exclude background
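+ # The sigmoid branch takes a joint top-k over all (query, class) pairs
+ # of the flattened score map; e.g. with num_classes = 3 (illustrative
+ # value), a flattened index of 7 decodes to query 7 // 3 = 2 and
+ # label 7 % 3 = 1.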
+ if self.loss_cls.use_sigmoid:
+ cls_score = cls_score.sigmoid()
+ scores, indexes = cls_score.view(-1).topk(max_per_img)
+ det_labels = indexes % self.num_classes
+ bbox_index = indexes // self.num_classes
+ bbox_pred = bbox_pred[bbox_index]
+ else:
+ scores, det_labels = F.softmax(cls_score, dim=-1)[..., :-1].max(-1)
+ scores, bbox_index = scores.topk(max_per_img)
+ bbox_pred = bbox_pred[bbox_index]
+ det_labels = det_labels[bbox_index]
+
+ det_bboxes = bbox_cxcywh_to_xyxy(bbox_pred)
+ det_bboxes[:, 0::2] = det_bboxes[:, 0::2] * img_shape[1]
+ det_bboxes[:, 1::2] = det_bboxes[:, 1::2] * img_shape[0]
+ det_bboxes[:, 0::2].clamp_(min=0, max=img_shape[1])
+ det_bboxes[:, 1::2].clamp_(min=0, max=img_shape[0])
+ if rescale:
+ det_bboxes /= det_bboxes.new_tensor(scale_factor)
+ det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(1)), -1)
+
+ return det_bboxes, det_labels
+
+ def simple_test_bboxes(self, feats, img_metas, rescale=False):
+ """Test det bboxes without test-time augmentation.
+
+ Args:
+ feats (tuple[torch.Tensor]): Multi-level features from the
+ upstream network, each is a 4D-tensor.
+ img_metas (list[dict]): List of image information.
+ rescale (bool, optional): Whether to rescale the results.
+ Defaults to False.
+
+ Returns:
+ list[tuple[Tensor, Tensor]]: Each item in result_list is 2-tuple.
+ The first item is ``bboxes`` with shape (n, 5),
+ where 5 represent (tl_x, tl_y, br_x, br_y, score).
+ The shape of the second tensor in the tuple is ``labels``
+ with shape (n,)
+ """
+ # forward of this head requires img_metas
+ outs = self.forward(feats, img_metas)
+ results_list = self.get_bboxes(*outs, img_metas, rescale=rescale)
+ return results_list
+
+ def forward_onnx(self, feats, img_metas):
+ """Forward function for exporting to ONNX.
+
+ Over-write `forward` because: `masks` is directly created with
+ zero (valid position tag) and has the same spatial size as `x`.
+ Thus the construction of `masks` is different from that in `forward`.
+
+ Args:
+ feats (tuple[Tensor]): Features from the upstream network, each is
+ a 4D-tensor.
+ img_metas (list[dict]): List of image information.
+
+ Returns:
+ tuple[list[Tensor], list[Tensor]]: Outputs for all scale levels.
+
+ - all_cls_scores_list (list[Tensor]): Classification scores \
+ for each scale level. Each is a 4D-tensor with shape \
+ [nb_dec, bs, num_query, cls_out_channels]. Note \
+ `cls_out_channels` should include the background class.
+ - all_bbox_preds_list (list[Tensor]): Sigmoid regression \
+ outputs for each scale level. Each is a 4D-tensor with \
+ normalized coordinate format (cx, cy, w, h) and shape \
+ [nb_dec, bs, num_query, 4].
+ """
+ num_levels = len(feats)
+ img_metas_list = [img_metas for _ in range(num_levels)]
+ return multi_apply(self.forward_single_onnx, feats, img_metas_list)
+
+ def forward_single_onnx(self, x, img_metas):
+ """"Forward function for a single feature level with ONNX exportation.
+
+ Args:
+ x (Tensor): Input feature from backbone's single stage, shape
+ [bs, c, h, w].
+ img_metas (list[dict]): List of image information.
+
+ Returns:
+ all_cls_scores (Tensor): Outputs from the classification head,
+ shape [nb_dec, bs, num_query, cls_out_channels]. Note
+ cls_out_channels should include the background class.
+ all_bbox_preds (Tensor): Sigmoid outputs from the regression
+ head with normalized coordinate format (cx, cy, w, h).
+ Shape [nb_dec, bs, num_query, 4].
+ """
+ # Note `img_shape` is not dynamically traceable to ONNX,
+ # since the related augmentation was done with numpy under
+ # CPU. Thus `masks` is directly created with zeros (valid tag)
+ # and the same spatial shape as `x`.
+ # The difference between torch and exported ONNX model may be
+ # ignored, since the same performance is achieved (e.g.
+ # 40.1 vs 40.1 for DETR)
+ batch_size = x.size(0)
+ h, w = x.size()[-2:]
+ masks = x.new_zeros((batch_size, h, w)) # [B,h,w]
+
+ x = self.input_proj(x)
+ # interpolate masks to have the same spatial shape with x
+ masks = F.interpolate(masks.unsqueeze(1),
+ size=x.shape[-2:]).to(torch.bool).squeeze(1)
+ pos_embed = self.positional_encoding(masks)
+ outs_dec, _ = self.transformer(x, masks, self.query_embedding.weight,
+ pos_embed)
+
+ all_cls_scores = self.fc_cls(outs_dec)
+ all_bbox_preds = self.fc_reg(self.activate(
+ self.reg_ffn(outs_dec))).sigmoid()
+ return all_cls_scores, all_bbox_preds
+
+ def onnx_export(self, all_cls_scores_list, all_bbox_preds_list, img_metas):
+ """Transform network outputs into bbox predictions, with ONNX
+ exportation.
+
+ Args:
+ all_cls_scores_list (list[Tensor]): Classification outputs
+ for each feature level. Each is a 4D-tensor with shape
+ [nb_dec, bs, num_query, cls_out_channels].
+ all_bbox_preds_list (list[Tensor]): Sigmoid regression
+ outputs for each feature level. Each is a 4D-tensor with
+ normalized coordinate format (cx, cy, w, h) and shape
+ [nb_dec, bs, num_query, 4].
+ img_metas (list[dict]): Meta information of each image.
+
+ Returns:
+ tuple[Tensor, Tensor]: dets of shape [N, num_det, 5]
+ and class labels of shape [N, num_det].
+ """
+ assert len(img_metas) == 1, \
+ 'Only support one input image while in exporting to ONNX'
+
+ cls_scores = all_cls_scores_list[-1][-1]
+ bbox_preds = all_bbox_preds_list[-1][-1]
+
+ # Note `img_shape` is not dynamically traceable to ONNX,
+ # here `img_shape_for_onnx` (padded shape of image tensor)
+ # is used.
+ img_shape = img_metas[0]['img_shape_for_onnx']
+ max_per_img = self.test_cfg.get('max_per_img', self.num_query)
+ batch_size = cls_scores.size(0)
+ # `batch_index_offset` is used for gathering from the concatenated tensor
+ batch_index_offset = torch.arange(batch_size).to(
+ cls_scores.device) * max_per_img
+ batch_index_offset = batch_index_offset.unsqueeze(1).expand(
+ batch_size, max_per_img)
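+ # e.g. with batch_size = 2 and max_per_img = 100 (illustrative values),
+ # sample 0 keeps its indices while sample 1 has 100 added to each of its
+ # top-k indices before gathering from the flattened prediction tensor.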
+
+ # supports dynamic batch inference
+ if self.loss_cls.use_sigmoid:
+ cls_scores = cls_scores.sigmoid()
+ scores, indexes = cls_scores.view(batch_size, -1).topk(max_per_img,
+ dim=1)
+ det_labels = indexes % self.num_classes
+ bbox_index = indexes // self.num_classes
+ bbox_index = (bbox_index + batch_index_offset).view(-1)
+ bbox_preds = bbox_preds.view(-1, 4)[bbox_index]
+ bbox_preds = bbox_preds.view(batch_size, -1, 4)
+ else:
+ scores, det_labels = F.softmax(cls_scores,
+ dim=-1)[..., :-1].max(-1)
+ scores, bbox_index = scores.topk(max_per_img, dim=1)
+ bbox_index = (bbox_index + batch_index_offset).view(-1)
+ bbox_preds = bbox_preds.view(-1, 4)[bbox_index]
+ det_labels = det_labels.view(-1)[bbox_index]
+ bbox_preds = bbox_preds.view(batch_size, -1, 4)
+ det_labels = det_labels.view(batch_size, -1)
+
+ det_bboxes = bbox_cxcywh_to_xyxy(bbox_preds)
+ # use `img_shape_tensor` for dynamically exporting to ONNX
+ img_shape_tensor = img_shape.flip(0).repeat(2) # [w,h,w,h]
+ img_shape_tensor = img_shape_tensor.unsqueeze(0).unsqueeze(0).expand(
+ batch_size, det_bboxes.size(1), 4)
+ det_bboxes = det_bboxes * img_shape_tensor
+ # dynamically clip bboxes
+ x1, y1, x2, y2 = det_bboxes.split((1, 1, 1, 1), dim=-1)
+ from mmdet.core.export import dynamic_clip_for_onnx
+ x1, y1, x2, y2 = dynamic_clip_for_onnx(x1, y1, x2, y2, img_shape)
+ det_bboxes = torch.cat([x1, y1, x2, y2], dim=-1)
+ det_bboxes = torch.cat((det_bboxes, scores.unsqueeze(-1)), -1)
+
+ return det_bboxes, det_labels
+
+ # BaseDenseHead
+ def _bbox_post_process(self,
+ mlvl_scores,
+ mlvl_labels,
+ mlvl_bboxes,
+ scale_factor,
+ cfg,
+ rescale=False,
+ with_nms=True,
+ mlvl_score_factors=None,
+ **kwargs):
+ """bbox post-processing method.
+
+ The boxes are rescaled to the original image scale and NMS is
+ applied. Usually `with_nms` is set to False for aug tests.
+
+ Args:
+ mlvl_scores (list[Tensor]): Box scores from all scale
+ levels of a single image, each item has shape
+ (num_bboxes, ).
+ mlvl_labels (list[Tensor]): Box class labels from all scale
+ levels of a single image, each item has shape
+ (num_bboxes, ).
+ mlvl_bboxes (list[Tensor]): Decoded bboxes from all scale
+ levels of a single image, each item has shape (num_bboxes, 4).
+ scale_factor (ndarray, optional): Scale factor of the image arranged
+ as (w_scale, h_scale, w_scale, h_scale).
+ cfg (mmcv.Config): Test / postprocessing configuration,
+ if None, test_cfg would be used.
+ rescale (bool): If True, return boxes in original image space.
+ Default: False.
+ with_nms (bool): If True, do nms before return boxes.
+ Default: True.
+ mlvl_score_factors (list[Tensor], optional): Score factor from
+ all scale levels of a single image, each item has shape
+ (num_bboxes, ). Default: None.
+
+ Returns:
+ tuple[Tensor]: Results of detected bboxes and labels. If with_nms
+ is False and mlvl_score_factor is None, return mlvl_bboxes and
+ mlvl_scores, else return mlvl_bboxes, mlvl_scores and
+ mlvl_score_factor. Usually with_nms is set to False for aug
+ tests. If with_nms is True, then return the following format
+
+ - det_bboxes (Tensor): Predicted bboxes with shape \
+ [num_bboxes, 5], where the first 4 columns are bounding \
+ box positions (tl_x, tl_y, br_x, br_y) and the 5-th \
+ column are scores between 0 and 1.
+ - det_labels (Tensor): Predicted labels of the corresponding \
+ box with shape [num_bboxes].
+ """
+ assert len(mlvl_scores) == len(mlvl_bboxes) == len(mlvl_labels)
+
+ mlvl_bboxes = torch.cat(mlvl_bboxes)
+ if rescale:
+ mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
+ mlvl_scores = torch.cat(mlvl_scores)
+ mlvl_labels = torch.cat(mlvl_labels)
+
+ if mlvl_score_factors is not None:
+ # TODO: Add sqrt operation in order to be consistent with
+ # the paper.
+ mlvl_score_factors = torch.cat(mlvl_score_factors)
+ mlvl_scores = mlvl_scores * mlvl_score_factors
+
+ if with_nms:
+ if mlvl_bboxes.numel() == 0:
+ det_bboxes = torch.cat([mlvl_bboxes, mlvl_scores[:, None]], -1)
+ return det_bboxes, mlvl_labels
+
+ det_bboxes, keep_idxs = batched_nms(mlvl_bboxes, mlvl_scores,
+ mlvl_labels, cfg.nms)
+ det_bboxes = det_bboxes[:cfg.max_per_img]
+ det_labels = mlvl_labels[keep_idxs][:cfg.max_per_img]
+ return det_bboxes, det_labels
+ else:
+ return mlvl_bboxes, mlvl_scores, mlvl_labels
+
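+    # Illustrative only: `_bbox_post_process` expects an mmdet-style test
+    # cfg roughly of the form
+    #     cfg = dict(nms=dict(type='nms', iou_threshold=0.5), max_per_img=100)
+    # where `cfg.nms` is forwarded to `batched_nms` and `cfg.max_per_img`
+    # caps the number of kept detections.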
+ def simple_test(self, feats, img_metas, rescale=False):
+ """Test function without test-time augmentation.
+
+ Args:
+ feats (tuple[torch.Tensor]): Multi-level features from the
+ upstream network, each is a 4D-tensor.
+ img_metas (list[dict]): List of image information.
+ rescale (bool, optional): Whether to rescale the results.
+ Defaults to False.
+
+ Returns:
+            list[tuple[Tensor, Tensor]]: Each item in result_list is a
+                2-tuple. The first item is ``bboxes`` with shape (n, 5),
+                where the 5 columns are (tl_x, tl_y, br_x, br_y, score).
+                The second item is ``labels`` with shape (n, ).
+ """
+ return self.simple_test_bboxes(feats, img_metas, rescale=rescale)
+
+ # AnchorfreeHead
+
+ def _init_cls_convs(self):
+ """Initialize classification conv layers of the head."""
+ self.cls_convs = nn.ModuleList()
+ for i in range(self.stacked_convs):
+ chn = self.in_channels if i == 0 else self.feat_channels
+ if self.dcn_on_last_conv and i == self.stacked_convs - 1:
+ conv_cfg = dict(type='DCNv2')
+ else:
+ conv_cfg = self.conv_cfg
+ self.cls_convs.append(
+ ConvModule(chn,
+ self.feat_channels,
+ 3,
+ stride=1,
+ padding=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=self.norm_cfg,
+ bias=self.conv_bias))
+
+ def _init_reg_convs(self):
+ """Initialize bbox regression conv layers of the head."""
+ self.reg_convs = nn.ModuleList()
+ for i in range(self.stacked_convs):
+ chn = self.in_channels if i == 0 else self.feat_channels
+ if self.dcn_on_last_conv and i == self.stacked_convs - 1:
+ conv_cfg = dict(type='DCNv2')
+ else:
+ conv_cfg = self.conv_cfg
+ self.reg_convs.append(
+ ConvModule(chn,
+ self.feat_channels,
+ 3,
+ stride=1,
+ padding=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=self.norm_cfg,
+ bias=self.conv_bias))
+
+ def _init_predictor(self):
+ """Initialize predictor layers of the head."""
+ self.conv_cls = nn.Conv2d(self.feat_channels,
+ self.cls_out_channels,
+ 3,
+ padding=1)
+ self.conv_reg = nn.Conv2d(self.feat_channels, 4, 3, padding=1)
+
+ def _get_points_single(self,
+ featmap_size,
+ stride,
+ dtype,
+ device,
+ flatten=False):
+ """Get points of a single scale level.
+
+ This function will be deprecated soon.
+ """
+
+ warnings.warn(
+            '`_get_points_single` in `AnchorFreeHead` will be '
+            'deprecated soon. We support a multi-level point generator now; '
+            'you can get points of a single-level feature map '
+            'with `self.prior_generator.single_level_grid_priors`.')
+
+ h, w = featmap_size
+        # First create the range with the default dtype, then convert to the
+ # target `dtype` for onnx exporting.
+ x_range = torch.arange(w, device=device).to(dtype)
+ y_range = torch.arange(h, device=device).to(dtype)
+ y, x = torch.meshgrid(y_range, x_range)
+ if flatten:
+ y = y.flatten()
+ x = x.flatten()
+ return y, x
+
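+    # Sketch of the recommended replacement (assuming an mmdet-2.x style
+    # point generator is configured on this head):
+    #     priors = self.prior_generator.single_level_grid_priors(
+    #         featmap_size, level_idx, dtype=dtype, device=device)
+    # which returns the prior points of a single feature level directly.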
+ def get_points(self, featmap_sizes, dtype, device, flatten=False):
+ """Get points according to feature map sizes.
+
+ Args:
+ featmap_sizes (list[tuple]): Multi-level feature map sizes.
+ dtype (torch.dtype): Type of points.
+ device (torch.device): Device of points.
+
+ Returns:
+ tuple: points of each image.
+ """
+ warnings.warn(
+ '`get_points` in `AnchorFreeHead` will be '
+            'deprecated soon. We support a multi-level point generator now; '
+            'you can get points of all levels '
+            'with `self.prior_generator.grid_priors`.')
+
+ mlvl_points = []
+ for i in range(len(featmap_sizes)):
+ mlvl_points.append(
+ self._get_points_single(featmap_sizes[i], self.strides[i],
+ dtype, device, flatten))
+ return mlvl_points
+
+ def aug_test(self, feats, img_metas, rescale=False):
+ """Test function with test time augmentation.
+
+ Args:
+ feats (list[Tensor]): the outer list indicates test-time
+ augmentations and inner Tensor should have a shape NxCxHxW,
+ which contains features for all images in the batch.
+ img_metas (list[list[dict]]): the outer list indicates test-time
+ augs (multiscale, flip, etc.) and the inner list indicates
+ images in a batch. each dict has image information.
+ rescale (bool, optional): Whether to rescale the results.
+ Defaults to False.
+
+ Returns:
+ list[ndarray]: bbox results of each class
+ """
+ return self.aug_test_bboxes(feats, img_metas, rescale=rescale)
+
+
+class DeformableDETRHead(DETRHead):
+ """Head of DeformDETR: Deformable DETR: Deformable Transformers for End-to-
+ End Object Detection.
+
+ Code is modified from the `official github repo
+ `_.
+
+ More details can be found in the `paper
+ `_ .
+
+ Args:
+ with_box_refine (bool): Whether to refine the reference points
+ in the decoder. Defaults to False.
+ as_two_stage (bool) : Whether to generate the proposal from
+ the outputs of encoder.
+ transformer (obj:`ConfigDict`): ConfigDict is used for building
+ the Encoder and Decoder.
+ """
+ def __init__(
+ self,
+ *args,
+ with_box_refine=False,
+ as_two_stage=False,
+ transformer=None,
+ npose=144,
+ nbeta=10,
+ ncam=3,
+ hdim=256, # TODO: choose proper hdim
+ niter=3,
+ smpl_mean_params=None,
+ **kwargs):
+ self.with_box_refine = with_box_refine
+ self.as_two_stage = as_two_stage
+ self.npose = npose
+ self.nbeta = nbeta
+ self.ncam = ncam
+ self.hdim = hdim
+ self.niter = niter
+
+ if self.as_two_stage:
+ transformer['as_two_stage'] = self.as_two_stage
+
+ super(DeformableDETRHead, self).__init__(*args,
+ transformer=transformer,
+ **kwargs)
+
+ if smpl_mean_params is None:
+ init_pose = torch.zeros([1, npose])
+ init_shape = torch.zeros([1, nbeta])
+ init_cam = torch.FloatTensor([[1, 0, 0]])
+ else:
+ mean_params = np.load(smpl_mean_params)
+ init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0)
+ init_shape = torch.from_numpy(
+ mean_params['shape'][:].astype('float32')).unsqueeze(0)
+ init_cam = torch.from_numpy(mean_params['cam']).unsqueeze(0)
+ self.register_buffer('init_pose', init_pose)
+ self.register_buffer('init_shape', init_shape)
+ self.register_buffer('init_cam', init_cam)
+
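+    # Note (assumption): when `smpl_mean_params` is provided it is expected
+    # to be an .npz archive exposing 'pose' (144-dim, 6D per joint), 'shape'
+    # (10-dim) and 'cam' (3-dim) arrays, as in the commonly used SPIN/HMR
+    # smpl_mean_params.npz file.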
+ def _init_layers(self):
+ """Initialize classification branch and regression branch of head."""
+
+ fc_cls = Linear(self.embed_dims, self.cls_out_channels)
+ reg_branch = []
+ for _ in range(self.num_reg_fcs):
+ reg_branch.append(Linear(self.embed_dims, self.embed_dims))
+ reg_branch.append(nn.ReLU())
+ reg_branch.append(Linear(self.embed_dims, 4))
+ reg_branch = nn.Sequential(*reg_branch)
+
+ # smpl branch
+ smpl_branch = nn.ModuleList([
+ nn.Linear(self.embed_dims + self.npose + self.nbeta + self.ncam,
+ self.hdim), # fc1
+ nn.Dropout(),
+ nn.Linear(self.hdim, self.hdim), # fc2
+ nn.Dropout(),
+ nn.Linear(self.hdim, self.npose), # regress pose
+ nn.Linear(self.hdim, self.nbeta), # regress beta
+ nn.Linear(self.hdim, self.ncam) # regress cam
+ ])
+
+ def _get_clones(module, N):
+ return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+ # last reg_branch is used to generate proposal from
+ # encode feature map when as_two_stage is True.
+ num_pred = (self.transformer.decoder.num_layers + 1) if \
+ self.as_two_stage else self.transformer.decoder.num_layers
+
+ if self.with_box_refine:
+ self.cls_branches = _get_clones(fc_cls, num_pred)
+ self.reg_branches = _get_clones(reg_branch, num_pred)
+ self.smpl_branches = _get_clones(smpl_branch, num_pred)
+ else:
+
+ self.cls_branches = nn.ModuleList(
+ [fc_cls for _ in range(num_pred)])
+ self.reg_branches = nn.ModuleList(
+ [reg_branch for _ in range(num_pred)])
+ self.smpl_branches = nn.ModuleList(
+ [smpl_branch for _ in range(num_pred)])
+ if not self.as_two_stage:
+ self.query_embedding = nn.Embedding(self.num_query,
+ self.embed_dims * 2)
+
+ def regress_smpl(self,
+ lvl,
+ feature,
+ init_pose=None,
+ init_shape=None,
+ init_cam=None,
+ n_iter=3):
+ batch_size = feature.shape[0]
+ num_query = feature.shape[1]
+ if init_pose is None:
+ init_pose = self.init_pose.expand(batch_size, num_query, -1)
+ if init_shape is None:
+ init_shape = self.init_shape.expand(batch_size, num_query, -1)
+ if init_cam is None:
+ init_cam = self.init_cam.expand(batch_size, num_query, -1)
+
+ pred_pose = init_pose
+ pred_shape = init_shape
+ pred_cam = init_cam
+
+ for _ in range(n_iter):
+ xc = torch.cat([feature, pred_pose, pred_shape, pred_cam], -1)
+ xc = self.smpl_branches[lvl][0](xc) # fc1
+ xc = self.smpl_branches[lvl][1](xc) # drop
+ xc = self.smpl_branches[lvl][2](xc) # fc2
+ xc = self.smpl_branches[lvl][3](xc) # drop
+ pred_pose = self.smpl_branches[lvl][4](xc) + pred_pose # reg pose
+ pred_shape = self.smpl_branches[lvl][5](
+                xc) + pred_shape  # reg beta
+ pred_cam = self.smpl_branches[lvl][6](xc) + pred_cam # reg cam
+
+ pred_rotmat = rot6d_to_rotmat(pred_pose).view(batch_size, num_query,
+ 24, 3, 3)
+ return pred_rotmat, pred_shape, pred_cam
+
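+    # Illustrative shapes for `regress_smpl` (assuming the defaults
+    # npose=144, nbeta=10, ncam=3 and embed_dims=256):
+    #     feature:     (bs, num_query, 256)
+    #     pred_rotmat: (bs, num_query, 24, 3, 3)  # 144 = 24 joints x 6D
+    #     pred_shape:  (bs, num_query, 10)
+    #     pred_cam:    (bs, num_query, 3)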
+ def init_weights(self):
+ """Initialize weights of the DeformDETR head."""
+ self.transformer.init_weights()
+ if self.loss_cls.use_sigmoid:
+ bias_init = bias_init_with_prob(0.01)
+ for m in self.cls_branches:
+ nn.init.constant_(m.bias, bias_init)
+ for m in self.reg_branches:
+ constant_init(m[-1], 0, bias=0)
+ nn.init.constant_(self.reg_branches[0][-1].bias.data[2:], -2.0)
+ if self.as_two_stage:
+ for m in self.reg_branches:
+ nn.init.constant_(m[-1].bias.data[2:], 0.0)
+
+ def forward(self, mlvl_feats, img_metas):
+ """Forward function.
+
+ Args:
+ mlvl_feats (tuple[Tensor]): Features from the upstream
+ network, each is a 4D-tensor with shape
+ (N, C, H, W).
+ img_metas (list[dict]): List of image information.
+
+ Returns:
+ all_cls_scores (Tensor): Outputs from the classification head, \
+ shape [nb_dec, bs, num_query, cls_out_channels]. Note \
+                cls_out_channels should include background.
+ all_bbox_preds (Tensor): Sigmoid outputs from the regression \
+ head with normalized coordinate format (cx, cy, w, h). \
+ Shape [nb_dec, bs, num_query, 4].
+            enc_outputs_class (Tensor): The score of each point on the \
+                encoder feature map, has shape (N, h*w, num_class). Only \
+                returned when as_two_stage is True, otherwise `None`.
+            enc_outputs_coord (Tensor): The proposals generated from the \
+                encoder feature map, has shape (N, h*w, 4). Only returned \
+                when as_two_stage is True, otherwise `None`.
+ """
+
+ batch_size = mlvl_feats[0].size(0)
+ input_img_h, input_img_w = img_metas[0]['batch_input_shape']
+ img_masks = mlvl_feats[0].new_ones(
+ (batch_size, input_img_h, input_img_w))
+ for img_id in range(batch_size):
+ img_h, img_w = img_metas[img_id]['img_shape']
+ img_masks[img_id, :img_h, :img_w] = 0
+
+ mlvl_masks = []
+ mlvl_positional_encodings = []
+ for feat in mlvl_feats:
+ mlvl_masks.append(
+ F.interpolate(img_masks[None],
+ size=feat.shape[-2:]).to(torch.bool).squeeze(0))
+ mlvl_positional_encodings.append(
+ self.positional_encoding(mlvl_masks[-1]))
+
+ query_embeds = None
+ if not self.as_two_stage:
+ query_embeds = self.query_embedding.weight
+ hs, init_reference, inter_references, \
+ enc_outputs_class, enc_outputs_coord = self.transformer(
+ mlvl_feats,
+ mlvl_masks,
+ query_embeds,
+ mlvl_positional_encodings,
+ reg_branches=self.reg_branches if self.with_box_refine else None, # noqa:E501
+ cls_branches=self.cls_branches if self.as_two_stage else None, # noqa:E501
+ smpl_branches=self.smpl_branches if self.with_box_refine else None # noqa: E501
+ )
+ hs = hs.permute(0, 2, 1, 3)
+ outputs_classes = []
+ outputs_coords = []
+ outputs_poses = []
+ outputs_shapes = []
+ outputs_cams = []
+ for lvl in range(hs.shape[0]):
+ if lvl == 0:
+ reference = init_reference
+ else:
+ reference = inter_references[lvl - 1]
+ reference = inverse_sigmoid(reference)
+ outputs_class = self.cls_branches[lvl](hs[lvl])
+ tmp = self.reg_branches[lvl](hs[lvl])
+ if reference.shape[-1] == 4:
+ tmp += reference
+ else:
+ assert reference.shape[-1] == 2
+ tmp[..., :2] += reference
+ outputs_coord = tmp.sigmoid()
+
+ # smpl
+ pred_pose, pred_betas, pred_cam = \
+ self.regress_smpl(lvl, hs[lvl], n_iter=self.niter)
+ outputs_poses.append(pred_pose)
+ outputs_shapes.append(pred_betas)
+ outputs_cams.append(pred_cam)
+ outputs_classes.append(outputs_class)
+ outputs_coords.append(outputs_coord)
+
+ outputs_classes = torch.stack(outputs_classes)
+ outputs_coords = torch.stack(outputs_coords)
+ outputs_poses = torch.stack(outputs_poses)
+ outputs_shapes = torch.stack(outputs_shapes)
+ outputs_cams = torch.stack(outputs_cams)
+ if self.as_two_stage:
+ return outputs_classes, outputs_coords, \
+ outputs_poses, outputs_shapes, outputs_cams, \
+ enc_outputs_class, enc_outputs_coord.sigmoid()
+        else:
+            return outputs_classes, outputs_coords, \
+                outputs_poses, outputs_shapes, outputs_cams, \
+                None, None
+
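+    # Illustrative output shapes for `forward` above (bs = batch size,
+    # nq = num_query, nd = number of decoder layers):
+    #     outputs_classes: (nd, bs, nq, cls_out_channels)
+    #     outputs_coords:  (nd, bs, nq, 4)   # sigmoid (cx, cy, w, h)
+    #     outputs_poses:   (nd, bs, nq, 24, 3, 3)
+    #     outputs_shapes:  (nd, bs, nq, 10)
+    #     outputs_cams:    (nd, bs, nq, 3)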
+ @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
+ def loss(self,
+ all_cls_scores,
+ all_bbox_preds,
+ enc_cls_scores,
+ enc_bbox_preds,
+ gt_bboxes_list,
+ gt_labels_list,
+ img_metas,
+ gt_bboxes_ignore=None):
+ """"Loss function.
+
+ Args:
+ all_cls_scores (Tensor): Classification score of all
+ decoder layers, has shape
+ [nb_dec, bs, num_query, cls_out_channels].
+ all_bbox_preds (Tensor): Sigmoid regression
+ outputs of all decode layers. Each is a 4D-tensor with
+ normalized coordinate format (cx, cy, w, h) and shape
+ [nb_dec, bs, num_query, 4].
+            enc_cls_scores (Tensor): Classification scores of
+                points on the encoder feature map, has shape
+                (N, h*w, num_classes). Only passed when as_two_stage is
+                True, otherwise None.
+            enc_bbox_preds (Tensor): Regression results of each point
+                on the encoder feature map, has shape (N, h*w, 4). Only
+                passed when as_two_stage is True, otherwise None.
+ gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
+ with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
+ gt_labels_list (list[Tensor]): Ground truth class indices for each
+ image with shape (num_gts, ).
+ img_metas (list[dict]): List of image meta information.
+ gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
+ which can be ignored for each image. Default None.
+
+ Returns:
+ dict[str, Tensor]: A dictionary of loss components.
+ """
+        assert gt_bboxes_ignore is None, \
+            f'{self.__class__.__name__} only supports ' \
+            f'gt_bboxes_ignore set to None.'
+
+ num_dec_layers = len(all_cls_scores)
+ all_gt_bboxes_list = [gt_bboxes_list for _ in range(num_dec_layers)]
+ all_gt_labels_list = [gt_labels_list for _ in range(num_dec_layers)]
+ all_gt_bboxes_ignore_list = [
+ gt_bboxes_ignore for _ in range(num_dec_layers)
+ ]
+ img_metas_list = [img_metas for _ in range(num_dec_layers)]
+
+ losses_cls, losses_bbox, losses_iou = multi_apply(
+ self.loss_single, all_cls_scores, all_bbox_preds,
+ all_gt_bboxes_list, all_gt_labels_list, img_metas_list,
+ all_gt_bboxes_ignore_list)
+
+ loss_dict = dict()
+ # loss of proposal generated from encode feature map.
+ if enc_cls_scores is not None:
+ binary_labels_list = [
+ torch.zeros_like(gt_labels_list[i])
+ for i in range(len(img_metas))
+ ]
+ enc_loss_cls, enc_losses_bbox, enc_losses_iou = \
+ self.loss_single(enc_cls_scores, enc_bbox_preds,
+ gt_bboxes_list, binary_labels_list,
+ img_metas, gt_bboxes_ignore)
+ loss_dict['enc_loss_cls'] = enc_loss_cls
+ loss_dict['enc_loss_bbox'] = enc_losses_bbox
+ loss_dict['enc_loss_iou'] = enc_losses_iou
+
+ # loss from the last decoder layer
+ loss_dict['loss_cls'] = losses_cls[-1]
+ loss_dict['loss_bbox'] = losses_bbox[-1]
+ loss_dict['loss_iou'] = losses_iou[-1]
+ # loss from other decoder layers
+ num_dec_layer = 0
+ for loss_cls_i, loss_bbox_i, loss_iou_i in zip(losses_cls[:-1],
+ losses_bbox[:-1],
+ losses_iou[:-1]):
+ loss_dict[f'd{num_dec_layer}.loss_cls'] = loss_cls_i
+ loss_dict[f'd{num_dec_layer}.loss_bbox'] = loss_bbox_i
+ loss_dict[f'd{num_dec_layer}.loss_iou'] = loss_iou_i
+ num_dec_layer += 1
+ return loss_dict
+
+ @force_fp32(apply_to=('all_cls_scores_list', 'all_bbox_preds_list'))
+ def get_bboxes(self,
+ all_cls_scores,
+ all_bbox_preds,
+ enc_cls_scores,
+ enc_bbox_preds,
+ img_metas,
+ rescale=False):
+ """Transform network outputs for a batch into bbox predictions.
+
+ Args:
+ all_cls_scores (Tensor): Classification score of all
+ decoder layers, has shape
+ [nb_dec, bs, num_query, cls_out_channels].
+ all_bbox_preds (Tensor): Sigmoid regression
+ outputs of all decode layers. Each is a 4D-tensor with
+ normalized coordinate format (cx, cy, w, h) and shape
+ [nb_dec, bs, num_query, 4].
+            enc_cls_scores (Tensor): Classification scores of
+                points on the encoder feature map, has shape
+                (N, h*w, num_classes). Only passed when as_two_stage is
+                True, otherwise None.
+            enc_bbox_preds (Tensor): Regression results of each point
+                on the encoder feature map, has shape (N, h*w, 4). Only
+                passed when as_two_stage is True, otherwise None.
+ img_metas (list[dict]): Meta information of each image.
+ rescale (bool, optional): If True, return boxes in original
+ image space. Default False.
+
+ Returns:
+ list[list[Tensor, Tensor]]: Each item in result_list is 2-tuple. \
+ The first item is an (n, 5) tensor, where the first 4 columns \
+ are bounding box positions (tl_x, tl_y, br_x, br_y) and the \
+ 5-th column is a score between 0 and 1. The second item is a \
+ (n,) tensor where each item is the predicted class label of \
+ the corresponding box.
+ """
+ cls_scores = all_cls_scores[-1]
+ bbox_preds = all_bbox_preds[-1]
+
+ result_list = []
+ for img_id in range(len(img_metas)):
+ cls_score = cls_scores[img_id]
+ bbox_pred = bbox_preds[img_id]
+ img_shape = img_metas[img_id]['img_shape']
+ scale_factor = img_metas[img_id]['scale_factor']
+ proposals = self._get_bboxes_single(cls_score, bbox_pred,
+ img_shape, scale_factor,
+ rescale)
+ result_list.append(proposals)
+ return result_list
diff --git a/detrsmpl/models/heads/expose_head.py b/detrsmpl/models/heads/expose_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe825875189d5764cb48b6e35f625d4702157ee7
--- /dev/null
+++ b/detrsmpl/models/heads/expose_head.py
@@ -0,0 +1,526 @@
+import os
+import pickle
+from abc import abstractmethod
+from typing import List, Optional
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import build_activation_layer, initialize
+from mmcv.runner.base_module import BaseModule
+
+from detrsmpl.utils.geometry import rot6d_to_rotmat
+
+
+class IterativeRegression(nn.Module):
+ """Regressor for ExPose Head."""
+ def __init__(self,
+ module,
+ mean_param,
+ num_stages=1,
+ append_params=True,
+ learn_mean=False,
+ detach_mean=False,
+ dim=1,
+ **kwargs):
+ super(IterativeRegression, self).__init__()
+ self.module = module
+ self._num_stages = num_stages
+ self.dim = dim
+
+ if learn_mean:
+ self.register_parameter(
+ 'mean_param', nn.Parameter(mean_param, requires_grad=True))
+ else:
+ self.register_buffer('mean_param', mean_param)
+
+ self.append_params = append_params
+ self.detach_mean = detach_mean
+
+ def get_mean(self):
+ """Get the initial mean param."""
+ return self.mean_param.clone()
+
+ @property
+ def num_stages(self):
+ return self._num_stages
+
+ def forward(self,
+ features: torch.Tensor,
+ cond: Optional[torch.Tensor] = None):
+        """Compute deltas on top of the condition iteratively.
+
+        Args:
+            features (torch.Tensor): Input features.
+            cond (torch.Tensor, optional): Initial condition. If None, the
+                stored mean parameters are used.
+        """
+ batch_size = features.shape[0]
+ expand_shape = [batch_size] + [-1] * len(features.shape[1:])
+
+ parameters = []
+ deltas = []
+ module_input = features
+ if cond is None:
+ cond = self.mean_param.expand(*expand_shape).clone()
+
+ # Detach mean
+ if self.detach_mean:
+ cond = cond.detach()
+
+ if self.append_params:
+ assert features is not None, (
+ 'Features are none even though append_params is True')
+ module_input = torch.cat([module_input, cond], dim=self.dim)
+
+ deltas.append(self.module(module_input))
+ num_params = deltas[-1].shape[1]
+ parameters.append(cond[:, :num_params].clone() + deltas[-1])
+
+ for stage_idx in range(1, self.num_stages):
+ module_input = torch.cat([features, parameters[stage_idx - 1]],
+ dim=-1)
+ params_upd = self.module(module_input)
+ deltas.append(params_upd)
+ parameters.append(parameters[stage_idx - 1] + params_upd)
+
+ return parameters
+
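+# Minimal usage sketch for IterativeRegression (hypothetical shapes; MLP is
+# defined below in this file):
+#     mlp = MLP(input_dim=2048 + 32, output_dim=32, layers=[1024])
+#     mean = torch.zeros(1, 32)
+#     reg = IterativeRegression(mlp, mean, num_stages=3)
+#     params = reg(torch.randn(4, 2048))  # list of 3 tensors, each (4, 32)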
+
+class MLP(nn.Module):
+ """MLP
+ Args:
+ input_dim (int): Input dim of MLP.
+ output_dim (int): Output dim of MLP.
+ layers (List): Layer dims.
+ activ_type (str): Activation layer type.
+        dropout (float): Dropout probability.
+ gain (float): Xavier init gain value.
+ """
+ def __init__(
+ self,
+ input_dim: int,
+ output_dim: int,
+ layers: List[int] = [],
+ activ_type: str = 'relu',
+ dropout: float = 0.5,
+ gain: float = 0.01,
+ ):
+ super(MLP, self).__init__()
+ curr_input_dim = input_dim
+ self.num_layers = len(layers)
+
+ self.blocks = nn.ModuleList()
+ for layer_idx, layer_dim in enumerate(layers):
+ if activ_type == 'none':
+ active = None
+ else:
+ active = build_activation_layer(
+ cfg=dict(type=activ_type, inplace=True))
+ linear = nn.Linear(curr_input_dim, layer_dim, bias=True)
+ curr_input_dim = layer_dim
+
+ layer = []
+ layer.append(linear)
+
+ if active is not None:
+ layer.append(active)
+
+ if dropout > 0.0:
+ layer.append(nn.Dropout(dropout))
+
+ block = nn.Sequential(*layer)
+ self.add_module('layer_{:03d}'.format(layer_idx), block)
+ self.blocks.append(block)
+
+ self.output_layer = nn.Linear(curr_input_dim, output_dim)
+ initialize(self.output_layer,
+ init_cfg=dict(type='Xavier',
+ gain=gain,
+ distribution='uniform'))
+
+ def forward(self, module_input):
+ curr_input = module_input
+ for block in self.blocks:
+ curr_input = block(curr_input)
+ return self.output_layer(curr_input)
+
+
+class ContinuousRotReprDecoder:
+ """ExPose Decoder Decode latent representation to rotation.
+
+ Args:
+ num_angles (int): Joint num.
+ dtype: dtype.
+ mean (torch.tensor): Mean value for params.
+ """
+ def __init__(self, num_angles, dtype=torch.float32, mean=None):
+ self.num_angles = num_angles
+ self.dtype = dtype
+
+ if isinstance(mean, dict):
+ mean = mean.get('cont_rot_repr', None)
+ if mean is None:
+ mean = torch.tensor([1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
+ dtype=self.dtype).unsqueeze(dim=0).expand(
+ self.num_angles, -1).contiguous().view(-1)
+ if not torch.is_tensor(mean):
+ mean = torch.tensor(mean)
+ mean = mean.reshape(-1, 6)
+
+ if mean.shape[0] < self.num_angles:
+ mean = mean.repeat(self.num_angles // mean.shape[0] + 1,
+ 1).contiguous()
+ mean = mean[:self.num_angles]
+ elif mean.shape[0] > self.num_angles:
+ mean = mean[:self.num_angles]
+
+ mean = mean.reshape(-1)
+ self.mean = mean
+
+ def get_mean(self):
+ return self.mean.clone()
+
+ def get_dim_size(self):
+ return self.num_angles * 6
+
+ def __call__(self, module_input):
+ batch_size = module_input.shape[0]
+ reshaped_input = module_input.view(-1, 6)
+ rot_mats = rot6d_to_rotmat(reshaped_input)
+ # aa = rot6d_to_aa(reshaped_input)
+ # return aa.view(batch_size,-1,3)
+ return rot_mats.view(batch_size, -1, 3, 3)
+
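+# Sketch: ContinuousRotReprDecoder maps the flat 6D rotation representation
+# back to rotation matrices (shapes are illustrative):
+#     decoder = ContinuousRotReprDecoder(num_angles=21)
+#     decoder.get_dim_size()               # 126 = 21 * 6
+#     rots = decoder(torch.randn(8, 126))  # (8, 21, 3, 3)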
+
+class ExPoseHead(BaseModule):
+ """General Head for ExPose."""
+ def __init__(self, init_cfg=None):
+ super().__init__(init_cfg)
+
+ def load_regressor(self,
+ input_feat_dim: int = 2048,
+ param_mean: torch.Tensor = None,
+ regressor_cfg: dict = None):
+ """Build regressor for ExPose Head."""
+ param_dim = param_mean.numel()
+ regressor = MLP(input_feat_dim + param_dim, param_dim, **regressor_cfg)
+ self.regressor = IterativeRegression(regressor,
+ param_mean,
+ num_stages=3)
+
+ def load_param_decoder(self, mean_poses_dict):
+ """Build decoders for each pose."""
+ start = 0
+ mean_lst = []
+ self.pose_param_decoders = {}
+ for pose_param in self.pose_param_conf:
+ pose_name = pose_param['name']
+ num_angles = pose_param['num_angles']
+ if pose_param['use_mean']:
+ pose_decoder = ContinuousRotReprDecoder(
+ num_angles,
+ dtype=torch.float32,
+ mean=mean_poses_dict.get(pose_name, None))
+ else:
+ pose_decoder = ContinuousRotReprDecoder(num_angles,
+ dtype=torch.float32,
+ mean=None)
+ self.pose_param_decoders['{}_decoder'.format(
+ pose_name)] = pose_decoder
+ pose_dim = pose_decoder.get_dim_size()
+ pose_mean = pose_decoder.get_mean()
+ if pose_param['rotate_axis_x']:
+ pose_mean[3] = -1
+ idxs = list(range(start, start + pose_dim))
+ idxs = torch.tensor(idxs, dtype=torch.long)
+ self.register_buffer('{}_idxs'.format(pose_name), idxs)
+ start += pose_dim
+ mean_lst.append(pose_mean.view(-1))
+ return start, mean_lst
+
+ def get_camera_param(self, camera_cfg):
+ """Build camera param."""
+ camera_pos_scale = camera_cfg.get('pos_func')
+ if camera_pos_scale == 'softplus':
+ camera_scale_func = F.softplus
+ elif camera_pos_scale == 'exp':
+ camera_scale_func = torch.exp
+ elif camera_pos_scale == 'none' or camera_pos_scale == 'None':
+
+ def func(x):
+ return x
+
+ camera_scale_func = func
+ mean_scale = camera_cfg.get('mean_scale', 0.9)
+ if camera_pos_scale == 'softplus':
+ mean_scale = np.log(np.exp(mean_scale) - 1)
+ elif camera_pos_scale == 'exp':
+ mean_scale = np.log(mean_scale)
+ camera_mean = torch.tensor([mean_scale, 0.0, 0.0], dtype=torch.float32)
+ camera_param_dim = 3
+ return camera_mean, camera_param_dim, camera_scale_func
+
+ def flat_params_to_dict(self, param_tensor):
+ """Turn param tensors to dict."""
+ smplx_dict = {}
+ raw_dict = {}
+ for pose_param in self.pose_param_conf:
+ pose_name = pose_param['name']
+ pose_idxs = getattr(self, f'{pose_name}_idxs')
+ decoder = self.pose_param_decoders[f'{pose_name}_decoder']
+ pose = torch.index_select(param_tensor, 1, pose_idxs)
+ raw_dict[f'raw_{pose_name}'] = pose.clone()
+ smplx_dict[pose_name] = decoder(pose)
+ return smplx_dict, raw_dict
+
+ def get_mean(self, name, batch_size):
+ """Get mean value of params."""
+ mean_param = self.regressor.get_mean().view(-1)
+ if name is None:
+ return mean_param.reshape(1, -1).expand(batch_size, -1)
+ idxs = getattr(self, f'{name}_idxs')
+ return mean_param[idxs].reshape(1, -1).expand(batch_size, -1)
+
+ def get_num_betas(self):
+ return self.num_betas
+
+ def get_num_expression_coeffs(self):
+ return self.num_expression_coeffs
+
+ @abstractmethod
+ def forward(self, features):
+ pass
+
+
+class ExPoseBodyHead(ExPoseHead):
+ """Head for ExPose Body Model."""
+ def __init__(self,
+ init_cfg=None,
+ num_betas: int = 10,
+ num_expression_coeffs: int = 10,
+ mean_pose_path: str = '',
+ shape_mean_path: str = '',
+ pose_param_conf: list = None,
+ input_feat_dim: int = 2048,
+ regressor_cfg: dict = None,
+ camera_cfg: dict = None):
+ super().__init__(init_cfg)
+ self.num_betas = num_betas
+ self.num_expression_coeffs = num_expression_coeffs
+ # poses
+ self.pose_param_conf = pose_param_conf
+ mean_poses_dict = {}
+ if os.path.exists(mean_pose_path):
+ with open(mean_pose_path, 'rb') as f:
+ mean_poses_dict = pickle.load(f)
+ start, mean_lst = self.load_param_decoder(mean_poses_dict)
+
+ # shape
+ if os.path.exists(shape_mean_path):
+ shape_mean = torch.from_numpy(
+ np.load(shape_mean_path,
+ allow_pickle=True)).to(dtype=torch.float32).reshape(
+ 1, -1)[:, :num_betas].reshape(-1)
+ else:
+ shape_mean = torch.zeros([num_betas], dtype=torch.float32)
+ shape_idxs = list(range(start, start + num_betas))
+ self.register_buffer('shape_idxs',
+ torch.tensor(shape_idxs, dtype=torch.long))
+ start += num_betas
+ mean_lst.append(shape_mean.view(-1))
+
+ # expression
+ expression_mean = torch.zeros([num_expression_coeffs],
+ dtype=torch.float32)
+ expression_idxs = list(range(start, start + num_expression_coeffs))
+ self.register_buffer('expression_idxs',
+ torch.tensor(expression_idxs, dtype=torch.long))
+ start += num_expression_coeffs
+ mean_lst.append(expression_mean.view(-1))
+
+ # camera
+ mean, dim, scale_func = self.get_camera_param(camera_cfg)
+ self.camera_scale_func = scale_func
+ camera_idxs = list(range(start, start + dim))
+ self.register_buffer('camera_idxs',
+ torch.tensor(camera_idxs, dtype=torch.long))
+ start += dim
+ mean_lst.append(mean)
+
+ param_mean = torch.cat(mean_lst).view(1, -1)
+ self.load_regressor(input_feat_dim, param_mean, regressor_cfg)
+
+ def forward(self, features):
+ """Forward function of ExPose Body Head.
+
+ Args:
+            features (torch.Tensor): Features from the backbone.
+ """
+ body_parameters = self.regressor(features)[-1]
+ params_dict, raw_dict = self.flat_params_to_dict(body_parameters)
+ params_dict['betas'] = torch.index_select(body_parameters, 1,
+ self.shape_idxs)
+ params_dict['expression'] = torch.index_select(body_parameters, 1,
+ self.expression_idxs)
+
+ camera_params = torch.index_select(body_parameters, 1,
+ self.camera_idxs)
+ scale = camera_params[:, 0:1]
+ translation = camera_params[:, 1:3]
+ scale = self.camera_scale_func(scale)
+ camera_params = torch.cat([scale, translation], dim=1)
+ return {
+ 'pred_param': params_dict,
+ 'pred_cam': camera_params,
+ 'pred_raw': raw_dict
+ }
+
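+# Sketch of the ExPoseBodyHead output (keys follow the forward above; the
+# pose keys inside 'pred_param' depend on the configured pose_param_conf):
+#     out = body_head(torch.randn(2, 2048))
+#     out['pred_param']['betas'].shape       # (2, num_betas)
+#     out['pred_param']['expression'].shape  # (2, num_expression_coeffs)
+#     out['pred_cam'].shape                  # (2, 3) -> (scale, tx, ty)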
+
+class ExPoseHandHead(ExPoseHead):
+ """Head for ExPose Hand Model."""
+ def __init__(self,
+ init_cfg=None,
+ num_betas: int = 10,
+ mean_pose_path: str = '',
+ pose_param_conf: list = None,
+ input_feat_dim: int = 2048,
+ regressor_cfg: dict = None,
+ camera_cfg: dict = None):
+ super().__init__(init_cfg)
+ self.num_betas = num_betas
+ # poses
+ self.pose_param_conf = pose_param_conf
+ mean_poses_dict = {}
+ if os.path.exists(mean_pose_path):
+ with open(mean_pose_path, 'rb') as f:
+ mean_poses_dict = pickle.load(f)
+ start, mean_lst = self.load_param_decoder(mean_poses_dict)
+
+ shape_mean = torch.zeros([num_betas], dtype=torch.float32)
+ shape_idxs = list(range(start, start + num_betas))
+ self.register_buffer('shape_idxs',
+ torch.tensor(shape_idxs, dtype=torch.long))
+ start += num_betas
+ mean_lst.append(shape_mean.view(-1))
+
+ # camera
+ mean, dim, scale_func = self.get_camera_param(camera_cfg)
+ self.camera_scale_func = scale_func
+ camera_idxs = list(range(start, start + dim))
+ self.register_buffer('camera_idxs',
+ torch.tensor(camera_idxs, dtype=torch.long))
+ start += dim
+ mean_lst.append(mean)
+
+ param_mean = torch.cat(mean_lst).view(1, -1)
+ self.load_regressor(input_feat_dim, param_mean, regressor_cfg)
+ self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+
+ def forward(self, features, cond=None):
+ """Forward function of ExPose Hand Head.
+
+ Args:
+            features (List[torch.Tensor]): Multi-level features from the
+                backbone; only the last one is used.
+            cond (torch.Tensor, optional): Initial params. If None, the mean
+                params are used.
+ """
+ batch_size = features[-1].size(0)
+ features = self.avgpool(features[-1]).view(batch_size, -1)
+ hand_parameters = self.regressor(features, cond=cond)[-1]
+ params_dict, raw_dict = self.flat_params_to_dict(hand_parameters)
+ params_dict['betas'] = torch.index_select(hand_parameters, 1,
+ self.shape_idxs)
+
+ camera_params = torch.index_select(hand_parameters, 1,
+ self.camera_idxs)
+ scale = camera_params[:, 0:1]
+ translation = camera_params[:, 1:3]
+ scale = self.camera_scale_func(scale)
+ camera_params = torch.cat([scale, translation], dim=1)
+ return {
+ 'pred_param': params_dict,
+ 'pred_cam': camera_params,
+ 'pred_raw': raw_dict
+ }
+
+
+class ExPoseFaceHead(ExPoseHead):
+ """Head for ExPose Face Model."""
+ def __init__(self,
+ init_cfg=None,
+ num_betas: int = 10,
+ num_expression_coeffs: int = 10,
+ pose_param_conf: list = None,
+ mean_pose_path: str = '',
+ input_feat_dim: int = 2048,
+ regressor_cfg: dict = None,
+ camera_cfg: dict = None):
+ super().__init__(init_cfg)
+ self.num_betas = num_betas
+ self.num_expression_coeffs = num_expression_coeffs
+ # poses
+ self.pose_param_conf = pose_param_conf
+ mean_poses_dict = {}
+ if os.path.exists(mean_pose_path):
+ with open(mean_pose_path, 'rb') as f:
+ mean_poses_dict = pickle.load(f)
+ start, mean_lst = self.load_param_decoder(mean_poses_dict)
+
+ # shape
+ shape_mean = torch.zeros([num_betas], dtype=torch.float32)
+ shape_idxs = list(range(start, start + num_betas))
+ self.register_buffer('shape_idxs',
+ torch.tensor(shape_idxs, dtype=torch.long))
+ start += num_betas
+ mean_lst.append(shape_mean.view(-1))
+
+ # expression
+ expression_mean = torch.zeros([num_expression_coeffs],
+ dtype=torch.float32)
+ expression_idxs = list(range(start, start + num_expression_coeffs))
+ self.register_buffer('expression_idxs',
+ torch.tensor(expression_idxs, dtype=torch.long))
+ start += num_expression_coeffs
+ mean_lst.append(expression_mean.view(-1))
+
+ # camera
+ mean, dim, scale_func = self.get_camera_param(camera_cfg)
+ self.camera_scale_func = scale_func
+ camera_idxs = list(range(start, start + dim))
+ self.register_buffer('camera_idxs',
+ torch.tensor(camera_idxs, dtype=torch.long))
+ start += dim
+ mean_lst.append(mean)
+
+ param_mean = torch.cat(mean_lst).view(1, -1)
+ self.load_regressor(input_feat_dim, param_mean, regressor_cfg)
+ self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
+
+ def forward(self, features, cond=None):
+ """Forward function of ExPose Face Head.
+
+ Args:
+            features (List[torch.Tensor]): Multi-level features from the
+                backbone; only the last one is used.
+            cond (torch.Tensor, optional): Initial params. If None, the mean
+                params are used.
+ """
+ batch_size = features[-1].size(0)
+ features = self.avgpool(features[-1]).view(batch_size, -1)
+ head_parameters = self.regressor(features, cond=cond)[-1]
+ params_dict, raw_dict = self.flat_params_to_dict(head_parameters)
+ params_dict['betas'] = torch.index_select(head_parameters, 1,
+ self.shape_idxs)
+ params_dict['expression'] = torch.index_select(head_parameters, 1,
+ self.expression_idxs)
+
+ camera_params = torch.index_select(head_parameters, 1,
+ self.camera_idxs)
+ scale = camera_params[:, 0:1]
+ translation = camera_params[:, 1:3]
+ scale = self.camera_scale_func(scale)
+ camera_params = torch.cat([scale, translation], dim=1)
+ return {
+ 'pred_param': params_dict,
+ 'pred_cam': camera_params,
+ 'pred_raw': raw_dict
+ }
diff --git a/detrsmpl/models/heads/hmr_head.py b/detrsmpl/models/heads/hmr_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..379845d2a51d48af72116e9a4414698080288395
--- /dev/null
+++ b/detrsmpl/models/heads/hmr_head.py
@@ -0,0 +1,99 @@
+import numpy as np
+import torch
+import torch.nn as nn
+from mmcv.runner.base_module import BaseModule
+
+from detrsmpl.utils.geometry import rot6d_to_rotmat
+
+
+class HMRHead(BaseModule):
+ def __init__(self,
+ feat_dim,
+ smpl_mean_params=None,
+ npose=144,
+ nbeta=10,
+ ncam=3,
+ hdim=1024,
+ init_cfg=None):
+ super(HMRHead, self).__init__(init_cfg=init_cfg)
+ self.fc1 = nn.Linear(feat_dim + npose + nbeta + ncam, hdim)
+ self.drop1 = nn.Dropout()
+ self.fc2 = nn.Linear(hdim, hdim)
+ self.drop2 = nn.Dropout()
+ self.decpose = nn.Linear(hdim, npose)
+ self.decshape = nn.Linear(hdim, nbeta)
+ self.deccam = nn.Linear(hdim, ncam)
+
+ nn.init.xavier_uniform_(self.decpose.weight, gain=0.01)
+ nn.init.xavier_uniform_(self.decshape.weight, gain=0.01)
+ nn.init.xavier_uniform_(self.deccam.weight, gain=0.01)
+
+ if smpl_mean_params is None:
+ init_pose = torch.zeros([1, npose])
+ init_shape = torch.zeros([1, nbeta])
+ init_cam = torch.FloatTensor([[1, 0, 0]])
+ else:
+ mean_params = np.load(smpl_mean_params)
+ init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0)
+ init_shape = torch.from_numpy(
+ mean_params['shape'][:].astype('float32')).unsqueeze(0)
+ init_cam = torch.from_numpy(mean_params['cam']).unsqueeze(0)
+ self.register_buffer('init_pose', init_pose)
+ self.register_buffer('init_shape', init_shape)
+ self.register_buffer('init_cam', init_cam)
+
+ def forward(self,
+ x,
+ init_pose=None,
+ init_shape=None,
+ init_cam=None,
+ n_iter=3):
+
+ # hmr head only support one layer feature
+ if isinstance(x, list) or isinstance(x, tuple):
+ x = x[-1]
+
+ output_seq = False
+ if len(x.shape) == 4:
+ # use feature from the last layer of the backbone
+ # apply global average pooling on the feature map
+ x = x.mean(dim=-1).mean(dim=-1)
+ elif len(x.shape) == 3:
+ # temporal feature
+ output_seq = True
+ B, T, L = x.shape
+ x = x.view(-1, L)
+
+ batch_size = x.shape[0]
+ if init_pose is None:
+ init_pose = self.init_pose.expand(batch_size, -1)
+ if init_shape is None:
+ init_shape = self.init_shape.expand(batch_size, -1)
+ if init_cam is None:
+ init_cam = self.init_cam.expand(batch_size, -1)
+
+ pred_pose = init_pose
+ pred_shape = init_shape
+ pred_cam = init_cam
+ for i in range(n_iter):
+ xc = torch.cat([x, pred_pose, pred_shape, pred_cam], 1)
+ xc = self.fc1(xc)
+ xc = self.drop1(xc)
+ xc = self.fc2(xc)
+ xc = self.drop2(xc)
+ pred_pose = self.decpose(xc) + pred_pose
+ pred_shape = self.decshape(xc) + pred_shape
+ pred_cam = self.deccam(xc) + pred_cam
+
+ pred_rotmat = rot6d_to_rotmat(pred_pose).view(batch_size, 24, 3, 3)
+
+ if output_seq:
+ pred_rotmat = pred_rotmat.view(B, T, 24, 3, 3)
+ pred_shape = pred_shape.view(B, T, 10)
+ pred_cam = pred_cam.view(B, T, 3)
+ output = {
+ 'pred_pose': pred_rotmat,
+ 'pred_shape': pred_shape,
+ 'pred_cam': pred_cam
+ }
+ return output
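+
+
+# Minimal usage sketch for HMRHead (illustrative; feat_dim must match the
+# backbone's output channels):
+#     head = HMRHead(feat_dim=2048)
+#     out = head(torch.randn(2, 2048, 7, 7))  # pooled to (2, 2048) inside
+#     out['pred_pose'].shape   # (2, 24, 3, 3)
+#     out['pred_shape'].shape  # (2, 10)
+#     out['pred_cam'].shape    # (2, 3)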
diff --git a/detrsmpl/models/heads/hybrik_head.py b/detrsmpl/models/heads/hybrik_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..1cea3dcb83624b4611b6f8067db118a222671e44
--- /dev/null
+++ b/detrsmpl/models/heads/hybrik_head.py
@@ -0,0 +1,443 @@
+import numpy as np
+import torch
+import torch.cuda.comm
+import torch.nn as nn
+from mmcv.runner.base_module import BaseModule
+from torch.nn import functional as F
+
+from detrsmpl.core.conventions.keypoints_mapping import get_flip_pairs
+
+
+def norm_heatmap(norm_type, heatmap):
+ """Normalize heatmap.
+
+ Args:
+ norm_type (str):
+ type of normalization. Currently only 'softmax' is supported
+ heatmap (torch.Tensor):
+            model output heatmap with shape (Bx29xF^2), where F^2 is the
+            flattened spatial size of the feature map
+
+ Returns:
+ heatmap (torch.Tensor):
+ normalized heatmap according to specified type with
+ shape (Bx29xF^2)
+ """
+
+ # Input tensor shape: [N,C,...]
+ shape = heatmap.shape
+ if norm_type == 'softmax':
+ heatmap = heatmap.reshape(*shape[:2], -1)
+ # global soft max
+ heatmap = F.softmax(heatmap, 2)
+ return heatmap.reshape(*shape)
+ else:
+ raise NotImplementedError
+
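+# Sketch: with norm_type='softmax' the heatmap is normalized over the
+# flattened spatial dimension, e.g.
+#     hm = norm_heatmap('softmax', torch.randn(2, 29, 64 * 64 * 64))
+#     hm.sum(dim=-1)  # ~1.0 for every (batch, joint) entry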
+
+class HybrIKHead(BaseModule):
+ """HybrIK parameters regressor head.
+
+ Args:
+ feature_channel (int):
+ Number of input channels
+ deconv_dim (List[int]):
+ List of deconvolution dimensions
+ num_joints (int):
+ Number of keypoints
+ depth_dim (int):
+ Depth dimension
+ height_dim (int):
+ Height dimension
+ width_dim (int):
+ Width dimension
+ smpl_mean_params (str):
+ file name of the mean SMPL parameters
+ """
+ def __init__(
+ self,
+ feature_channel=512,
+ deconv_dim=[256, 256, 256],
+ num_joints=29,
+ depth_dim=64,
+ height_dim=64,
+ width_dim=64,
+ smpl_mean_params=None,
+ ):
+
+ super(HybrIKHead, self).__init__()
+
+ self.deconv_dim = deconv_dim
+ self._norm_layer = nn.BatchNorm2d
+ self.num_joints = num_joints
+ self.norm_type = 'softmax'
+ self.depth_dim = depth_dim
+ self.height_dim = height_dim
+ self.width_dim = width_dim
+ self.smpl_dtype = torch.float32
+ self.feature_channel = feature_channel
+
+ self.deconv_layers = self._make_deconv_layer()
+ self.final_layer = nn.Conv2d(self.deconv_dim[2],
+ self.num_joints * self.depth_dim,
+ kernel_size=1,
+ stride=1,
+ padding=0)
+
+ self.joint_pairs_24 = get_flip_pairs('smpl')
+ self.joint_pairs_29 = get_flip_pairs('hybrik_29')
+
+ self.leaf_pairs = ((0, 1), (3, 4))
+ self.root_idx_smpl = 0
+
+ # mean shape
+ init_shape = np.load(smpl_mean_params)
+ self.register_buffer('init_shape', torch.Tensor(init_shape).float())
+
+ self.avg_pool = nn.AdaptiveAvgPool2d(1)
+ self.fc1 = nn.Linear(self.feature_channel, 1024)
+ self.drop1 = nn.Dropout(p=0.5)
+ self.fc2 = nn.Linear(1024, 1024)
+ self.drop2 = nn.Dropout(p=0.5)
+ self.decshape = nn.Linear(1024, 10)
+ self.decphi = nn.Linear(1024, 23 * 2) # [cos(phi), sin(phi)]
+
+ def _make_deconv_layer(self):
+ deconv_layers = []
+ deconv1 = nn.ConvTranspose2d(self.feature_channel,
+ self.deconv_dim[0],
+ kernel_size=4,
+ stride=2,
+ padding=int(4 / 2) - 1,
+ bias=False)
+ bn1 = self._norm_layer(self.deconv_dim[0])
+ deconv2 = nn.ConvTranspose2d(self.deconv_dim[0],
+ self.deconv_dim[1],
+ kernel_size=4,
+ stride=2,
+ padding=int(4 / 2) - 1,
+ bias=False)
+ bn2 = self._norm_layer(self.deconv_dim[1])
+ deconv3 = nn.ConvTranspose2d(self.deconv_dim[1],
+ self.deconv_dim[2],
+ kernel_size=4,
+ stride=2,
+ padding=int(4 / 2) - 1,
+ bias=False)
+ bn3 = self._norm_layer(self.deconv_dim[2])
+
+ deconv_layers.append(deconv1)
+ deconv_layers.append(bn1)
+ deconv_layers.append(nn.ReLU(inplace=True))
+ deconv_layers.append(deconv2)
+ deconv_layers.append(bn2)
+ deconv_layers.append(nn.ReLU(inplace=True))
+ deconv_layers.append(deconv3)
+ deconv_layers.append(bn3)
+ deconv_layers.append(nn.ReLU(inplace=True))
+
+ return nn.Sequential(*deconv_layers)
+
+ def _initialize(self):
+ for name, m in self.deconv_layers.named_modules():
+ if isinstance(m, nn.ConvTranspose2d):
+ nn.init.normal_(m.weight, std=0.001)
+ elif isinstance(m, nn.BatchNorm2d):
+ nn.init.constant_(m.weight, 1)
+ nn.init.constant_(m.bias, 0)
+ for m in self.final_layer.modules():
+ if isinstance(m, nn.Conv2d):
+ nn.init.normal_(m.weight, std=0.001)
+ nn.init.constant_(m.bias, 0)
+
+ def uvd_to_cam(self,
+ uvd_jts,
+ trans_inv,
+ intrinsic_param,
+ joint_root,
+ depth_factor,
+ return_relative=True):
+ """Project uvd coordinates to camera frame.
+
+ Args:
+ uvd_jts (torch.Tensor):
+ uvd coordinates with shape (BxNum_jointsx3)
+ trans_inv (torch.Tensor):
+ inverse affine transformation matrix with shape (Bx2x3)
+ intrinsic_param (torch.Tensor):
+ camera intrinsic matrix with shape (Bx3x3)
+ joint_root (torch.Tensor):
+ root joint coordinate with shape (Bx3)
+            depth_factor (torch.Tensor):
+                depth factor with shape (Bx1)
+            return_relative (bool):
+                Set to True to return root-relative (root-normalized)
+                coordinates.
+ Default: True.
+
+ Returns:
+ xyz_jts (torch.Tensor):
+ uvd coordinates in camera frame with shape (BxNum_jointsx3)
+ """
+ assert uvd_jts.dim() == 3 and uvd_jts.shape[2] == 3, uvd_jts.shape
+ uvd_jts_new = uvd_jts.clone()
+ assert torch.sum(torch.isnan(uvd_jts)) == 0, ('uvd_jts', uvd_jts)
+
+ # remap uv coordinate to input space
+ uvd_jts_new[:, :, 0] = (uvd_jts[:, :, 0] + 0.5) * self.width_dim * 4
+ uvd_jts_new[:, :, 1] = (uvd_jts[:, :, 1] + 0.5) * self.height_dim * 4
+ # remap d to mm
+ uvd_jts_new[:, :, 2] = uvd_jts[:, :, 2] * depth_factor
+ assert torch.sum(torch.isnan(uvd_jts_new)) == 0, ('uvd_jts_new',
+ uvd_jts_new)
+
+ dz = uvd_jts_new[:, :, 2]
+
+ # transform in-bbox coordinate to image coordinate
+ uv_homo_jts = torch.cat(
+ (uvd_jts_new[:, :, :2], torch.ones_like(uvd_jts_new)[:, :, 2:]),
+ dim=2)
+ # batch-wise matrix multiply : (B,1,2,3) * (B,K,3,1) -> (B,K,2,1)
+ uv_jts = torch.matmul(trans_inv.unsqueeze(1),
+ uv_homo_jts.unsqueeze(-1))
+ # transform (u,v,1) to (x,y,z)
+ cam_2d_homo = torch.cat((uv_jts, torch.ones_like(uv_jts)[:, :, :1, :]),
+ dim=2)
+ # batch-wise matrix multiply : (B,1,3,3) * (B,K,3,1) -> (B,K,3,1)
+ xyz_jts = torch.matmul(intrinsic_param.unsqueeze(1), cam_2d_homo)
+ xyz_jts = xyz_jts.squeeze(dim=3)
+ # recover absolute z : (B,K) + (B,1)
+ abs_z = dz + joint_root[:, 2].unsqueeze(-1)
+ # multiply absolute z : (B,K,3) * (B,K,1)
+ xyz_jts = xyz_jts * abs_z.unsqueeze(-1)
+
+ if return_relative:
+ # (B,K,3) - (B,1,3)
+ xyz_jts = xyz_jts - joint_root.unsqueeze(1)
+
+ xyz_jts = xyz_jts / depth_factor.unsqueeze(-1)
+
+ return xyz_jts
+
+ def flip_uvd_coord(self, pred_jts, flip=False, flatten=True):
+ """Flip uvd coordinates.
+
+ Args:
+ pred_jts (torch.Tensor):
+ predicted uvd coordinates with shape (Bx87)
+ flip (bool):
+                Set to True to flip uvd coordinates. Default: False.
+            flatten (bool):
+                Set to True to reshape uvd coordinates to shape (Bx29x3).
+                Default: True
+
+ Returns:
+ pred_jts (torch.Tensor):
+ flipped uvd coordinates with shape (Bx29x3)
+ """
+ if flatten:
+ assert pred_jts.dim() == 2
+ num_batches = pred_jts.shape[0]
+ pred_jts = pred_jts.reshape(num_batches, self.num_joints, 3)
+ else:
+ assert pred_jts.dim() == 3
+ num_batches = pred_jts.shape[0]
+
+ # flip
+ if flip:
+ pred_jts[:, :, 0] = -pred_jts[:, :, 0]
+ else:
+ pred_jts[:, :, 0] = -1 / self.width_dim - pred_jts[:, :, 0]
+
+ for pair in self.joint_pairs_29:
+ dim0, dim1 = pair
+ idx = torch.Tensor((dim0, dim1)).long()
+ inv_idx = torch.Tensor((dim1, dim0)).long()
+ pred_jts[:, idx] = pred_jts[:, inv_idx]
+
+ return pred_jts
+
+ def flip_phi(self, pred_phi):
+ """Flip phi.
+
+ Args:
+ pred_phi (torch.Tensor): phi in shape (Num_twistx2)
+
+ Returns:
+ pred_phi (torch.Tensor): flipped phi in shape (Num_twistx2)
+ """
+ pred_phi[:, :, 1] = -1 * pred_phi[:, :, 1]
+
+ for pair in self.joint_pairs_24:
+ dim0, dim1 = pair
+ idx = torch.Tensor((dim0 - 1, dim1 - 1)).long()
+ inv_idx = torch.Tensor((dim1 - 1, dim0 - 1)).long()
+ pred_phi[:, idx] = pred_phi[:, inv_idx]
+
+ return pred_phi
+
+ def forward(self,
+ feature,
+ trans_inv,
+ intrinsic_param,
+ joint_root,
+ depth_factor,
+ smpl_layer,
+ flip_item=None,
+ flip_output=False):
+ """Forward function.
+
+ Args:
+ feature (torch.Tensor): features extracted from backbone
+ trans_inv (torch.Tensor):
+ inverse affine transformation matrix with shape (Bx2x3)
+ intrinsic_param (torch.Tensor):
+ camera intrinsic matrix with shape (Bx3x3)
+ joint_root (torch.Tensor):
+ root joint coordinate with shape (Bx3)
+            depth_factor (torch.Tensor):
+                depth factor with shape (Bx1)
+            smpl_layer (nn.Module):
+                SMPL body model layer
+            flip_item (List[torch.Tensor]|None):
+                list containing items to flip
+            flip_output (bool):
+                Set to True to flip the output. Default: False
+
+ Returns:
+ output (dict): Dict containing model predictions.
+ """
+ batch_size = feature.shape[0]
+
+ x0 = feature
+ out = self.deconv_layers(x0)
+ out = self.final_layer(out)
+
+ out = out.reshape((out.shape[0], self.num_joints, -1))
+ out = norm_heatmap(self.norm_type, out)
+ assert out.dim() == 3, out.shape
+
+ if self.norm_type == 'sigmoid':
+ maxvals, _ = torch.max(out, dim=2, keepdim=True)
+ else:
+ maxvals = torch.ones((*out.shape[:2], 1),
+ dtype=torch.float,
+ device=out.device)
+
+ heatmaps = out / out.sum(dim=2, keepdim=True)
+
+ heatmaps = heatmaps.reshape(
+ (heatmaps.shape[0], self.num_joints, self.depth_dim,
+ self.height_dim, self.width_dim))
+
+ hm_x = heatmaps.sum((2, 3))
+ hm_y = heatmaps.sum((2, 4))
+ hm_z = heatmaps.sum((3, 4))
+
+ hm_x = hm_x * torch.cuda.comm.broadcast(torch.arange(
+ hm_x.shape[-1]).type(torch.cuda.FloatTensor),
+ devices=[hm_x.device.index])[0]
+ hm_y = hm_y * torch.cuda.comm.broadcast(torch.arange(
+ hm_y.shape[-1]).type(torch.cuda.FloatTensor),
+ devices=[hm_y.device.index])[0]
+ hm_z = hm_z * torch.cuda.comm.broadcast(torch.arange(
+ hm_z.shape[-1]).type(torch.cuda.FloatTensor),
+ devices=[hm_z.device.index])[0]
+ coord_x = hm_x.sum(dim=2, keepdim=True)
+ coord_y = hm_y.sum(dim=2, keepdim=True)
+ coord_z = hm_z.sum(dim=2, keepdim=True)
+
+ coord_x = coord_x / float(self.width_dim) - 0.5
+ coord_y = coord_y / float(self.height_dim) - 0.5
+ coord_z = coord_z / float(self.depth_dim) - 0.5
+
+ # -0.5 ~ 0.5
+ pred_uvd_jts_29 = torch.cat((coord_x, coord_y, coord_z), dim=2)
+
+ pred_uvd_jts_29_flat = pred_uvd_jts_29.reshape(
+ (batch_size, self.num_joints * 3))
+
+ x0 = self.avg_pool(x0)
+ x0 = x0.view(x0.size(0), -1)
+ init_shape = self.init_shape.expand(batch_size, -1) # (B, 10,)
+
+ xc = x0
+
+ xc = self.fc1(xc)
+ xc = self.drop1(xc)
+ xc = self.fc2(xc)
+ xc = self.drop2(xc)
+
+ delta_shape = self.decshape(xc)
+ pred_shape = delta_shape + init_shape
+ pred_phi = self.decphi(xc)
+
+ if flip_item is not None:
+ assert flip_output
+ pred_uvd_jts_29_orig, pred_phi_orig, pred_leaf_orig, \
+ pred_shape_orig = flip_item
+
+ if flip_output:
+            pred_uvd_jts_29 = self.flip_uvd_coord(pred_uvd_jts_29,
+                                                  flatten=False)
+ if flip_output and flip_item is not None:
+ pred_uvd_jts_29 = (pred_uvd_jts_29 + pred_uvd_jts_29_orig.reshape(
+ batch_size, 29, 3)) / 2
+
+ pred_uvd_jts_29_flat = pred_uvd_jts_29.reshape(
+ (batch_size, self.num_joints * 3))
+
+ # -0.5 ~ 0.5
+ # Rotate back
+ pred_xyz_jts_29 = self.uvd_to_cam(pred_uvd_jts_29, trans_inv,
+ intrinsic_param, joint_root,
+ depth_factor)
+ assert torch.sum(
+ torch.isnan(pred_xyz_jts_29)) == 0, ('pred_xyz_jts_29',
+ pred_xyz_jts_29)
+
+ pred_xyz_jts_29 = pred_xyz_jts_29 - \
+ pred_xyz_jts_29[:, self.root_idx_smpl, :].unsqueeze(1)
+
+ pred_phi = pred_phi.reshape(batch_size, 23, 2)
+
+ if flip_output:
+ pred_phi = self.flip_phi(pred_phi)
+
+ if flip_output and flip_item is not None:
+ pred_phi = (pred_phi + pred_phi_orig) / 2
+ pred_shape = (pred_shape + pred_shape_orig) / 2
+
+ hybrik_output = smpl_layer(
+ pose_skeleton=pred_xyz_jts_29.type(self.smpl_dtype) * 2,
+ betas=pred_shape.type(self.smpl_dtype),
+ phis=pred_phi.type(self.smpl_dtype),
+ global_orient=None,
+ return_verts=True)
+ pred_vertices = hybrik_output['vertices'].float()
+ # -0.5 ~ 0.5
+ pred_xyz_jts_24_struct = hybrik_output['joints'].float() / 2
+ # -0.5 ~ 0.5
+ pred_xyz_jts_17 = hybrik_output['joints_from_verts'].float() / 2
+ pred_poses = hybrik_output['poses'].float().reshape(
+ batch_size, 24, 3, 3)
+ pred_xyz_jts_24 = pred_xyz_jts_29[:, :24, :].reshape(batch_size, 72)
+ pred_xyz_jts_24_struct = pred_xyz_jts_24_struct.reshape(batch_size, 72)
+ pred_xyz_jts_17 = pred_xyz_jts_17.reshape(batch_size, 17 * 3)
+
+ output = {
+ 'pred_phi': pred_phi,
+ 'pred_delta_shape': delta_shape,
+ 'pred_shape': pred_shape,
+ 'pred_pose': pred_poses,
+ 'pred_uvd_jts': pred_uvd_jts_29_flat,
+ 'pred_xyz_jts_24': pred_xyz_jts_24,
+ 'pred_xyz_jts_24_struct': pred_xyz_jts_24_struct,
+ 'pred_xyz_jts_17': pred_xyz_jts_17,
+ 'pred_vertices': pred_vertices,
+ 'maxvals': maxvals,
+ }
+
+ return output
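+
+
+# Sketch of the main HybrIKHead outputs (B = batch size):
+#     pred_uvd_jts:    (B, 29 * 3)    uvd coords roughly in [-0.5, 0.5]
+#     pred_xyz_jts_24: (B, 72)        root-relative 3D joints
+#     pred_pose:       (B, 24, 3, 3)  rotation matrices from the SMPL layer
+#     pred_shape:      (B, 10)        SMPL betas
+#     pred_vertices:   (B, 6890, 3)   for the standard SMPL topology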
diff --git a/detrsmpl/models/heads/pare_head.py b/detrsmpl/models/heads/pare_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..a27d4cae5b04103bbd71ea2fd99a048830e9cec9
--- /dev/null
+++ b/detrsmpl/models/heads/pare_head.py
@@ -0,0 +1,611 @@
+"""This script is modified from [PARE](https://github.com/
+mkocabas/PARE/tree/master/pare/models/layers).
+
+Original license please see docs/additional_licenses.md.
+"""
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.runner.base_module import BaseModule
+from torch.nn.modules.utils import _pair
+
+from detrsmpl.utils.geometry import rot6d_to_rotmat
+
+
+class LocallyConnected2d(nn.Module):
+ """Locally Connected Layer.
+
+ Args:
+ in_channels (int):
+ the in channel of the features.
+ out_channels (int):
+ the out channel of the features.
+ output_size (List[int]):
+ the output size of the features.
+ kernel_size (int):
+ the size of the kernel.
+ stride (int):
+ the stride of the kernel.
+ Returns:
+ attended_features (torch.Tensor):
+ attended feature maps
+ """
+ def __init__(self,
+ in_channels,
+ out_channels,
+ output_size,
+ kernel_size,
+ stride,
+ bias=False):
+ super(LocallyConnected2d, self).__init__()
+ output_size = _pair(output_size)
+ self.weight = nn.Parameter(
+ torch.randn(1, out_channels, in_channels, output_size[0],
+ output_size[1], kernel_size**2),
+ requires_grad=True,
+ )
+ if bias:
+ self.bias = nn.Parameter(torch.randn(1, out_channels,
+ output_size[0],
+ output_size[1]),
+ requires_grad=True)
+ else:
+ self.register_parameter('bias', None)
+ self.kernel_size = _pair(kernel_size)
+ self.stride = _pair(stride)
+
+ def forward(self, x):
+ _, c, h, w = x.size()
+ kh, kw = self.kernel_size
+ dh, dw = self.stride
+ x = x.unfold(2, kh, dh).unfold(3, kw, dw)
+ x = x.contiguous().view(*x.size()[:-2], -1)
+ # Sum in in_channel and kernel_size dims
+ out = (x.unsqueeze(1) * self.weight).sum([2, -1])
+ if self.bias is not None:
+ out += self.bias
+ return out
+
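+# Sketch: LocallyConnected2d applies a different (unshared) weight at every
+# output location; with kernel_size=1 and stride=1 the spatial size is kept:
+#     lc = LocallyConnected2d(64, 3, output_size=(56, 56), kernel_size=1,
+#                             stride=1)
+#     lc(torch.randn(2, 64, 56, 56)).shape  # (2, 3, 56, 56)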
+
+class KeypointAttention(nn.Module):
+ """Keypoint Attention Layer.
+
+ Args:
+ use_conv (bool):
+ whether to use conv for the attended feature map.
+ Default: False
+ in_channels (List[int]):
+ the in channel of shape_cam features and pose features.
+ Default: (256, 64)
+ out_channels (List[int]):
+ the out channel of shape_cam features and pose features.
+ Default: (256, 64)
+ Returns:
+ attended_features (torch.Tensor):
+ attended feature maps
+ """
+ def __init__(self,
+ use_conv=False,
+ in_channels=(256, 64),
+ out_channels=(256, 64),
+ act='softmax',
+ use_scale=False):
+ super(KeypointAttention, self).__init__()
+ self.use_conv = use_conv
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.act = act
+ self.use_scale = use_scale
+ if use_conv:
+ self.conv1x1_pose = nn.Conv1d(in_channels[0],
+ out_channels[0],
+ kernel_size=1)
+ self.conv1x1_shape_cam = nn.Conv1d(in_channels[1],
+ out_channels[1],
+ kernel_size=1)
+
+ def forward(self, features, heatmaps):
+ batch_size, num_joints, height, width = heatmaps.shape
+
+ if self.use_scale:
+ scale = 1.0 / np.sqrt(height * width)
+ heatmaps = heatmaps * scale
+
+ if self.act == 'softmax':
+ normalized_heatmap = F.softmax(heatmaps.reshape(
+ batch_size, num_joints, -1),
+ dim=-1)
+ elif self.act == 'sigmoid':
+ normalized_heatmap = torch.sigmoid(
+ heatmaps.reshape(batch_size, num_joints, -1))
+ features = features.reshape(batch_size, -1, height * width)
+
+ attended_features = torch.matmul(normalized_heatmap,
+ features.transpose(2, 1))
+ attended_features = attended_features.transpose(2, 1)
+
+ if self.use_conv:
+ if attended_features.shape[1] == self.in_channels[0]:
+ attended_features = self.conv1x1_pose(attended_features)
+ else:
+ attended_features = self.conv1x1_shape_cam(attended_features)
+
+ return attended_features
+
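+# Sketch: KeypointAttention pools a feature map with per-joint heatmaps used
+# as soft attention weights (shapes illustrative):
+#     att = KeypointAttention(use_conv=False)
+#     feats = torch.randn(2, 256, 56, 56)
+#     hms = torch.randn(2, 24, 56, 56)
+#     att(feats, hms).shape  # (2, 256, 24) -> one feature vector per joint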
+
+def interpolate(feat, uv):
+ """
+ Args:
+ feat (torch.Tensor): [B, C, H, W] image features
+ uv (torch.Tensor): [B, 2, N] uv coordinates
+ in the image plane, range [-1, 1]
+ Returns:
+ samples[:, :, :, 0] (torch.Tensor):
+ [B, C, N] image features at the uv coordinates
+ """
+ if uv.shape[-1] != 2:
+ uv = uv.transpose(1, 2) # [B, N, 2]
+ uv = uv.unsqueeze(2) # [B, N, 1, 2]
+    # NOTE: for newer PyTorch versions, training results seem to be slightly
+    # degraded due to an implementation difference in F.grid_sample; for old
+    # versions, simply remove the align_corners argument.
+ if int(torch.__version__.split('.')[1]) < 4:
+ samples = torch.nn.functional.grid_sample(feat, uv) # [B, C, N, 1]
+ else:
+ samples = torch.nn.functional.grid_sample(
+ feat, uv, align_corners=True) # [B, C, N, 1]
+ return samples[:, :, :, 0] # [B, C, N]
+
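+# Sketch: `interpolate` samples per-point features from a feature map, e.g.
+#     feat = torch.randn(2, 64, 56, 56)
+#     uv = torch.rand(2, 2, 24) * 2 - 1  # 24 points in [-1, 1]
+#     sampled = interpolate(feat, uv)    # (2, 64, 24)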
+
+def _softmax(tensor, temperature, dim=-1):
+ return F.softmax(tensor * temperature, dim=dim)
+
+
+def softargmax2d(
+ heatmaps,
+ temperature=None,
+ normalize_keypoints=True,
+):
+ """Softargmax layer for heatmaps."""
+ dtype, device = heatmaps.dtype, heatmaps.device
+ if temperature is None:
+ temperature = torch.tensor(1.0, dtype=dtype, device=device)
+ batch_size, num_channels, height, width = heatmaps.shape
+ x = torch.arange(0, width, device=device, dtype=dtype).reshape(
+ 1, 1, 1, width).expand(batch_size, -1, height, -1)
+ y = torch.arange(0, height, device=device,
+ dtype=dtype).reshape(1, 1, height,
+ 1).expand(batch_size, -1, -1, width)
+ # Should be Bx2xHxW
+ points = torch.cat([x, y], dim=1)
+ normalized_heatmap = _softmax(heatmaps.reshape(batch_size, num_channels,
+ -1),
+ temperature=temperature.reshape(1, -1, 1),
+ dim=-1)
+
+ # Should be BxJx2
+ keypoints = (
+ normalized_heatmap.reshape(batch_size, -1, 1, height * width) *
+ points.reshape(batch_size, 1, 2, -1)).sum(dim=-1)
+
+ if normalize_keypoints:
+ # Normalize keypoints to [-1, 1]
+ keypoints[:, :, 0] = (keypoints[:, :, 0] / (width - 1) * 2 - 1)
+ keypoints[:, :, 1] = (keypoints[:, :, 1] / (height - 1) * 2 - 1)
+
+ return keypoints, normalized_heatmap.reshape(batch_size, -1, height, width)
+
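+# Sketch: softargmax2d turns heatmaps into differentiable 2D keypoints,
+# normalized to [-1, 1] when normalize_keypoints=True:
+#     kpts, norm_hm = softargmax2d(torch.randn(2, 24, 56, 56))
+#     kpts.shape     # (2, 24, 2)
+#     norm_hm.shape  # (2, 24, 56, 56)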
+
+class PareHead(BaseModule):
+ def __init__(
+ self,
+ num_joints=24,
+ num_input_features=480,
+ softmax_temp=1.0,
+ num_deconv_layers=3,
+ num_deconv_filters=(256, 256, 256),
+ num_deconv_kernels=(4, 4, 4),
+ num_camera_params=3,
+ num_features_smpl=64,
+ final_conv_kernel=1,
+ pose_mlp_num_layers=1,
+ shape_mlp_num_layers=1,
+ pose_mlp_hidden_size=256,
+ shape_mlp_hidden_size=256,
+ bn_momentum=0.1,
+ use_heatmaps='part_segm',
+ use_keypoint_attention=False,
+ use_postconv_keypoint_attention=False,
+ keypoint_attention_act='softmax', # softmax, sigmoid
+ use_scale_keypoint_attention=False,
+ backbone='hrnet_w32-conv', # hrnet, resnet
+ smpl_mean_params=None,
+ deconv_with_bias=False,
+ ):
+ """PARE parameters regressor head. This class is modified from.
+
+        [PARE](https://github.com/
+ mkocabas/PARE/blob/master/pare/models/head/pare_head.py). Original
+ license please see docs/additional_licenses.md.
+
+ Args:
+ num_joints (int):
+ Number of joints, should be 24 for smpl.
+ num_input_features (int):
+ Number of input featuremap channels.
+            softmax_temp (float):
+                Softmax temperature.
+ num_deconv_layers (int):
+ Number of deconvolution layers.
+ num_deconv_filters (List[int]):
+ Number of filters for each deconvolution layer,
+ len(num_deconv_filters) == num_deconv_layers.
+ num_deconv_kernels (List[int]):
+ Kernel size for each deconvolution layer,
+ len(num_deconv_kernels) == num_deconv_layers.
+ num_camera_params (int):
+ Number of predicted camera parameter dimension.
+ num_features_smpl (int):
+ Number of feature map channels.
+            final_conv_kernel (int):
+                Kernel size of the final convolution layers.
+            pose_mlp_num_layers (int):
+                Number of MLP layers for pose parameter regression.
+            shape_mlp_num_layers (int):
+                Number of MLP layers for shape parameter regression.
+            pose_mlp_hidden_size (int):
+                Hidden size of the pose MLP layers.
+            shape_mlp_hidden_size (int):
+                Hidden size of the shape MLP layers.
+            bn_momentum (float):
+                Momentum for batch normalization.
+            use_heatmaps (str):
+                Type of heatmaps to use.
+            use_keypoint_attention (bool):
+                Whether to use attention based on heatmaps.
+            use_postconv_keypoint_attention (bool):
+                Whether to apply a 1x1 convolution after keypoint attention.
+            keypoint_attention_act (str):
+                Type of activation function for the attention layers.
+            use_scale_keypoint_attention (bool):
+                Whether to scale the attention
+                according to the size of the attention map.
+            deconv_with_bias (bool):
+                Whether to use bias in the deconvolution layers.
+            backbone (str):
+                Type of the backbone.
+            smpl_mean_params (str):
+                File name of the mean SMPL parameters.
+ """
+
+ super(PareHead, self).__init__()
+ self.backbone = backbone
+ self.num_joints = num_joints
+ self.deconv_with_bias = deconv_with_bias
+ self.use_heatmaps = use_heatmaps
+ self.pose_mlp_num_layers = pose_mlp_num_layers
+ self.shape_mlp_num_layers = shape_mlp_num_layers
+ self.pose_mlp_hidden_size = pose_mlp_hidden_size
+ self.shape_mlp_hidden_size = shape_mlp_hidden_size
+ self.use_keypoint_attention = use_keypoint_attention
+
+ self.num_input_features = num_input_features
+ self.bn_momentum = bn_momentum
+ if self.use_heatmaps == 'part_segm':
+
+ self.use_keypoint_attention = True
+
+ if backbone.startswith('hrnet'):
+
+ self.keypoint_deconv_layers = self._make_conv_layer(
+ num_deconv_layers,
+ num_deconv_filters,
+ (3, ) * num_deconv_layers,
+ )
+ self.num_input_features = num_input_features
+ self.smpl_deconv_layers = self._make_conv_layer(
+ num_deconv_layers,
+ num_deconv_filters,
+ (3, ) * num_deconv_layers,
+ )
+ else:
+ # part branch that estimates 2d keypoints
+
+ conv_fn = self._make_deconv_layer
+
+ self.keypoint_deconv_layers = conv_fn(
+ num_deconv_layers,
+ num_deconv_filters,
+ num_deconv_kernels,
+ )
+            # reset the input feature channels for the SMPL branch
+            # (2048 for the final ResNet layer)
+            self.num_input_features = num_input_features
+ self.smpl_deconv_layers = conv_fn(
+ num_deconv_layers,
+ num_deconv_filters,
+ num_deconv_kernels,
+ )
+
+ pose_mlp_inp_dim = num_deconv_filters[-1]
+ smpl_final_dim = num_features_smpl
+ shape_mlp_inp_dim = num_joints * smpl_final_dim
+
+ self.keypoint_final_layer = nn.Conv2d(
+ in_channels=num_deconv_filters[-1],
+ out_channels=num_joints +
+ 1 if self.use_heatmaps in ('part_segm',
+ 'part_segm_pool') else num_joints,
+ kernel_size=final_conv_kernel,
+ stride=1,
+ padding=1 if final_conv_kernel == 3 else 0,
+ )
+
+ self.smpl_final_layer = nn.Conv2d(
+ in_channels=num_deconv_filters[-1],
+ out_channels=smpl_final_dim,
+ kernel_size=final_conv_kernel,
+ stride=1,
+ padding=1 if final_conv_kernel == 3 else 0,
+ )
+
+ # temperature for softargmax function
+ self.register_buffer('temperature', torch.tensor(softmax_temp))
+ mean_params = np.load(smpl_mean_params)
+ init_pose = torch.from_numpy(mean_params['pose'][:]).unsqueeze(0)
+ init_shape = torch.from_numpy(
+ mean_params['shape'][:].astype('float32')).unsqueeze(0)
+ init_cam = torch.from_numpy(mean_params['cam']).unsqueeze(0)
+ self.register_buffer('init_pose', init_pose)
+ self.register_buffer('init_shape', init_shape)
+ self.register_buffer('init_cam', init_cam)
+
+ self.pose_mlp_inp_dim = pose_mlp_inp_dim
+ self.shape_mlp_inp_dim = shape_mlp_inp_dim
+
+ self.shape_mlp = self._get_shape_mlp(output_size=10)
+ self.cam_mlp = self._get_shape_mlp(output_size=num_camera_params)
+
+ self.pose_mlp = self._get_pose_mlp(num_joints=num_joints,
+ output_size=6)
+
+ self.keypoint_attention = KeypointAttention(
+ use_conv=use_postconv_keypoint_attention,
+ in_channels=(self.pose_mlp_inp_dim, smpl_final_dim),
+ out_channels=(self.pose_mlp_inp_dim, smpl_final_dim),
+ act=keypoint_attention_act,
+ use_scale=use_scale_keypoint_attention,
+ )
+
+ def _get_shape_mlp(self, output_size):
+ """mlp layers for shape regression."""
+ if self.shape_mlp_num_layers == 1:
+ return nn.Linear(self.shape_mlp_inp_dim, output_size)
+
+ module_list = []
+ for i in range(self.shape_mlp_num_layers):
+ if i == 0:
+ module_list.append(
+ nn.Linear(self.shape_mlp_inp_dim,
+ self.shape_mlp_hidden_size))
+ elif i == self.shape_mlp_num_layers - 1:
+ module_list.append(
+ nn.Linear(self.shape_mlp_hidden_size, output_size))
+ else:
+ module_list.append(
+ nn.Linear(self.shape_mlp_hidden_size,
+ self.shape_mlp_hidden_size))
+ return nn.Sequential(*module_list)
+
+ def _get_pose_mlp(self, num_joints, output_size):
+ """mlp layers for pose regression."""
+ if self.pose_mlp_num_layers == 1:
+
+ return LocallyConnected2d(
+ in_channels=self.pose_mlp_inp_dim,
+ out_channels=output_size,
+ output_size=[num_joints, 1],
+ kernel_size=1,
+ stride=1,
+ )
+
+ module_list = []
+ for i in range(self.pose_mlp_num_layers):
+ if i == 0:
+ module_list.append(
+ LocallyConnected2d(
+ in_channels=self.pose_mlp_inp_dim,
+ out_channels=self.pose_mlp_hidden_size,
+ output_size=[num_joints, 1],
+ kernel_size=1,
+ stride=1,
+ ))
+ elif i == self.pose_mlp_num_layers - 1:
+ module_list.append(
+ LocallyConnected2d(
+ in_channels=self.pose_mlp_hidden_size,
+ out_channels=output_size,
+ output_size=[num_joints, 1],
+ kernel_size=1,
+ stride=1,
+ ))
+ else:
+ module_list.append(
+ LocallyConnected2d(
+ in_channels=self.pose_mlp_hidden_size,
+ out_channels=self.pose_mlp_hidden_size,
+ output_size=[num_joints, 1],
+ kernel_size=1,
+ stride=1,
+ ))
+ return nn.Sequential(*module_list)
+
+ def _get_deconv_cfg(self, deconv_kernel):
+ """get deconv padding, output padding according to kernel size."""
+ if deconv_kernel == 4:
+ padding = 1
+ output_padding = 0
+ elif deconv_kernel == 3:
+ padding = 1
+ output_padding = 1
+ elif deconv_kernel == 2:
+ padding = 0
+ output_padding = 0
+
+ return deconv_kernel, padding, output_padding
+
+ def _make_conv_layer(self, num_layers, num_filters, num_kernels):
+ """make convolution layers."""
+        assert num_layers == len(num_filters), \
+            'ERROR: num_conv_layers is different from len(num_conv_filters)'
+        assert num_layers == len(num_kernels), \
+            'ERROR: num_conv_layers is different from len(num_conv_kernels)'
+ layers = []
+ for i in range(num_layers):
+ kernel, padding, output_padding = \
+ self._get_deconv_cfg(num_kernels[i])
+
+ planes = num_filters[i]
+ layers.append(
+ nn.Conv2d(in_channels=self.num_input_features,
+ out_channels=planes,
+ kernel_size=kernel,
+ stride=1,
+ padding=padding,
+ bias=self.deconv_with_bias))
+ layers.append(nn.BatchNorm2d(planes, momentum=self.bn_momentum))
+ layers.append(nn.ReLU(inplace=True))
+ self.num_input_features = planes
+
+ return nn.Sequential(*layers)
+
+ def _make_deconv_layer(self, num_layers, num_filters, num_kernels):
+ """make deconvolution layers."""
+        assert num_layers == len(num_filters), \
+            'ERROR: num_deconv_layers != len(num_deconv_filters)'
+        assert num_layers == len(num_kernels), \
+            'ERROR: num_deconv_layers != len(num_deconv_kernels)'
+
+ layers = []
+ for i in range(num_layers):
+ kernel, padding, output_padding = \
+ self._get_deconv_cfg(num_kernels[i])
+
+ planes = num_filters[i]
+ layers.append(
+ nn.ConvTranspose2d(in_channels=self.num_input_features,
+ out_channels=planes,
+ kernel_size=kernel,
+ stride=2,
+ padding=padding,
+ output_padding=output_padding,
+ bias=self.deconv_with_bias))
+ layers.append(nn.BatchNorm2d(planes, momentum=self.bn_momentum))
+ layers.append(nn.ReLU(inplace=True))
+ # if self.use_self_attention:
+ # layers.append(SelfAttention(planes))
+ self.num_input_features = planes
+
+ return nn.Sequential(*layers)
+
+ def forward(self, features):
+ batch_size = features.shape[0]
+
+ init_pose = self.init_pose.expand(batch_size, -1) # N, Jx6
+ init_shape = self.init_shape.expand(batch_size, -1)
+ init_cam = self.init_cam.expand(batch_size, -1)
+
+ output = {}
+
+ part_feats = self._get_2d_branch_feats(features)
+
+ part_attention = self._get_part_attention_map(part_feats, output)
+
+ smpl_feats = self._get_3d_smpl_feats(features, part_feats)
+
+ point_local_feat, cam_shape_feats = self._get_local_feats(
+ smpl_feats, part_attention, output)
+
+ pred_pose, pred_shape, pred_cam = self._get_final_preds(
+ point_local_feat, cam_shape_feats, init_pose, init_shape, init_cam)
+
+ pred_rotmat = rot6d_to_rotmat(pred_pose).reshape(batch_size, 24, 3, 3)
+
+ output.update({
+ 'pred_pose': pred_rotmat,
+ 'pred_cam': pred_cam,
+ 'pred_shape': pred_shape,
+ })
+ return output
+
+ def _get_local_feats(self, smpl_feats, part_attention, output):
+ # 1x1 conv
+ """get keypoints and camera features from backbone features."""
+
+ cam_shape_feats = self.smpl_final_layer(smpl_feats)
+
+ if self.use_keypoint_attention:
+ point_local_feat = self.keypoint_attention(smpl_feats,
+ part_attention)
+ cam_shape_feats = self.keypoint_attention(cam_shape_feats,
+ part_attention)
+ else:
+ point_local_feat = interpolate(smpl_feats, output['pred_kp2d'])
+ cam_shape_feats = interpolate(cam_shape_feats, output['pred_kp2d'])
+ return point_local_feat, cam_shape_feats
+
+ def _get_2d_branch_feats(self, features):
+ """get part features from backbone features."""
+ part_feats = self.keypoint_deconv_layers(features)
+
+ return part_feats
+
+ def _get_3d_smpl_feats(self, features, part_feats):
+ """get smpl feature maps from backbone features."""
+
+ smpl_feats = self.smpl_deconv_layers(features)
+
+ return smpl_feats
+
+ def _get_part_attention_map(self, part_feats, output):
+ """get attention map from part feature map."""
+ heatmaps = self.keypoint_final_layer(part_feats)
+
+ if self.use_heatmaps == 'part_segm':
+
+ output['pred_segm_mask'] = heatmaps
+            # remove the background channel
+ heatmaps = heatmaps[:, 1:, :, :]
+ else:
+ pred_kp2d, _ = softargmax2d(heatmaps, self.temperature)
+ output['pred_kp2d'] = pred_kp2d
+ output['pred_heatmaps_2d'] = heatmaps
+ return heatmaps
+
+ def _get_final_preds(self, pose_feats, cam_shape_feats, init_pose,
+ init_shape, init_cam):
+ """get final preds."""
+ return self._pare_get_final_preds(pose_feats, cam_shape_feats,
+ init_pose, init_shape, init_cam)
+
+ def _pare_get_final_preds(self, pose_feats, cam_shape_feats, init_pose,
+ init_shape, init_cam):
+ """get final preds."""
+        pose_feats = pose_feats.unsqueeze(-1)  # (N, C, J, 1)
+
+ if init_pose.shape[-1] == 6:
+ # This means init_pose comes from a previous iteration
+ init_pose = init_pose.transpose(2, 1).unsqueeze(-1)
+ else:
+ # This means init pose comes from mean pose
+ init_pose = init_pose.reshape(init_pose.shape[0], 6,
+ -1).unsqueeze(-1)
+
+ shape_feats = cam_shape_feats
+
+ shape_feats = torch.flatten(shape_feats, start_dim=1)
+
+ pred_pose = self.pose_mlp(pose_feats)
+ pred_cam = self.cam_mlp(shape_feats)
+ pred_shape = self.shape_mlp(shape_feats)
+
+ pred_pose = pred_pose.squeeze(-1).transpose(2, 1) # N, J, 6
+ return pred_pose, pred_shape, pred_cam
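+
+
+if __name__ == '__main__':
+    # Minimal smoke test (illustrative only). The mean-parameter path below is
+    # an assumption: point it at an SMPL mean-params .npz file providing
+    # 'pose' (144,), 'shape' (10,) and 'cam' (3,) arrays.
+    head = PareHead(num_input_features=480,
+                    backbone='hrnet_w32-conv',
+                    smpl_mean_params='data/body_models/smpl_mean_params.npz')
+    feats = torch.randn(2, 480, 56, 56)
+    out = head(feats)
+    print(out['pred_pose'].shape,   # torch.Size([2, 24, 3, 3])
+          out['pred_shape'].shape,  # torch.Size([2, 10])
+          out['pred_cam'].shape)    # torch.Size([2, 3])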
diff --git a/detrsmpl/models/losses/__init__.py b/detrsmpl/models/losses/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/models/losses/balanced_mse_loss.py b/detrsmpl/models/losses/balanced_mse_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..fee145e114eaf9ca7efa83ddc4bf9765efb3c7fa
--- /dev/null
+++ b/detrsmpl/models/losses/balanced_mse_loss.py
@@ -0,0 +1,146 @@
+# ------------------------------------------------------------------------------
+# Adapted from https://github.com/jiawei-ren/BalancedMSE
+# Original licence: Copyright (c) 2022 Jiawei Ren, under the MIT License.
+# ------------------------------------------------------------------------------
+
+from typing import Optional, Union
+
+import torch
+import torch.distributed as dist
+import torch.nn.functional as F
+from mmcv.runner import get_dist_info
+from torch.nn.modules.loss import _Loss
+
+from .utils import weighted_loss
+
+
+@weighted_loss
+def bmc_loss_md(pred: torch.Tensor, target: torch.Tensor,
+ noise_var: torch.Tensor, all_gather: bool,
+ loss_mse_weight: float,
+ loss_debias_weight: float) -> torch.Tensor:
+ """
+ Args:
+ pred (torch.Tensor): The prediction. Shape should be (N, L).
+ target (torch.Tensor): The learning target of the prediction.
+ noise_var (torch.Tensor): Noise var of ground truth distribution.
+ all_gather (bool): Whether gather tensors across all sub-processes.
+ Only used in DDP training scheme.
+ loss_mse_weight (float, optional): The weight of the mse term.
+ loss_debias_weight (float, optional): The weight of the debiased term.
+
+ Returns:
+ torch.Tensor: The calculated loss
+ """
+ N = pred.shape[0]
+ L = pred.shape[1]
+ device = pred.device
+
+ loss_mse = F.mse_loss(pred, target, reduction='none').sum(-1)
+ loss_mse = loss_mse / noise_var
+
+ if all_gather:
+ rank, world_size = get_dist_info()
+ bs, length = target.shape
+ all_bs = [torch.zeros(1).to(device) for _ in range(world_size)]
+ dist.all_gather(all_bs, torch.Tensor([bs]).to(device))
+ all_bs_int = [int(v.item()) for v in all_bs]
+ max_bs_int = max(all_bs_int)
+ target_padding = torch.zeros(max_bs_int, length).to(device)
+ target_padding[:bs] = target
+ all_tensor = []
+ for _ in range(world_size):
+ all_tensor.append(torch.zeros(max_bs_int, length).type_as(target))
+ dist.all_gather(all_tensor, target_padding)
+ # remove padding
+ for i in range(world_size):
+ all_tensor[i] = all_tensor[i][:all_bs_int[i]]
+ target = torch.cat(all_tensor, dim=0)
+
+ # Debias term
+ target = target.unsqueeze(0).repeat(N, 1, 1)
+ pred = pred.unsqueeze(1).expand_as(target)
+ debias_term = F.mse_loss(pred, target, reduction='none').sum(-1)
+ debias_term = -0.5 * debias_term / noise_var
+ loss_debias = torch.logsumexp(debias_term, dim=1).squeeze(-1)
+ loss = loss_mse * loss_mse_weight + loss_debias * loss_debias_weight
+ # recover loss scale of mse_loss
+ loss = loss / L * noise_var.detach()
+ return loss
+
+
+class BMCLossMD(_Loss):
+ """Balanced MSE loss, use batch monte-carlo to estimate distribution.
+ https://arxiv.org/abs/2203.16427.
+
+ Args:
+ init_noise_sigma (float, optional): The initial value of noise sigma.
+ This sigma is used to represent ground truth distribution.
+ Defaults to 1.0.
+ all_gather (bool, optional): Whether gather tensors across all
+ sub-processes. If set True, BMC will have more precise estimation
+ with more time cost. Default: False.
+ reduction (str, optional): The method that reduces the loss to a
+ scalar. Options are "none", "mean" and "sum".
+ loss_mse_weight (float, optional): The weight of the mse term.
+ Defaults to 1.0.
+ loss_debias_weight (float, optional): The weight of the debiased term.
+ Defaults to 1.0.
+ """
+ def __init__(self,
+ init_noise_sigma: Optional[float] = 1.0,
+ all_gather: Optional[bool] = False,
+ reduction: Optional[str] = 'mean',
+ loss_mse_weight: Optional[float] = 1.0,
+ loss_debias_weight: Optional[float] = 1.0):
+ super(BMCLossMD, self).__init__()
+ self.noise_sigma = torch.nn.Parameter(
+ torch.tensor(init_noise_sigma).float())
+ self.all_gather = all_gather
+ assert reduction in (None, 'none', 'mean', 'sum')
+ reduction = 'none' if reduction is None else reduction
+ self.reduction = reduction
+ self.loss_mse_weight = loss_mse_weight
+ self.loss_debias_weight = loss_debias_weight
+
+ def forward(
+ self,
+ pred: torch.Tensor,
+ target: torch.Tensor,
+ weight: Optional[Union[torch.Tensor, None]] = None,
+ avg_factor: Optional[Union[int, None]] = None,
+ reduction_override: Optional[Union[str,
+ None]] = None) -> torch.Tensor:
+ """Forward function of loss.
+
+ Args:
+ pred (torch.Tensor): The prediction.
+ target (torch.Tensor): The learning target of the prediction.
+ weight (torch.Tensor, optional): Weight of the loss for each
+ prediction. Defaults to None.
+ avg_factor (int, optional): Average factor that is used to average
+ the loss. Defaults to None.
+ reduction_override (str, optional): The reduction method used to
+ override the original reduction method of the loss.
+ Defaults to None.
+ Returns:
+ torch.Tensor: The calculated loss
+ """
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ noise_var = (self.noise_sigma**2).type_as(pred)
+ pred = pred.view(pred.shape[0], -1)
+ target = target.view(target.shape[0], -1)
+ loss = bmc_loss_md(pred,
+ target,
+ noise_var=noise_var,
+ all_gather=self.all_gather,
+ loss_mse_weight=self.loss_mse_weight,
+ loss_debias_weight=self.loss_debias_weight,
+ weight=weight,
+ reduction=reduction,
+ avg_factor=avg_factor)
+ return loss
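+
+
+if __name__ == '__main__':
+    # Illustrative single-process usage (all_gather disabled): the balanced
+    # MSE behaves like an MSE term plus a batch-wise debiasing term.
+    loss_fn = BMCLossMD(init_noise_sigma=1.0, all_gather=False)
+    pred = torch.randn(8, 10, requires_grad=True)
+    target = torch.randn(8, 10)
+    loss = loss_fn(pred, target)
+    loss.backward()
+    print(loss.item())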
diff --git a/detrsmpl/models/losses/builder.py b/detrsmpl/models/losses/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7c1db0f3b1b11ec918e3c9a5a094e3dcfce2978
--- /dev/null
+++ b/detrsmpl/models/losses/builder.py
@@ -0,0 +1,60 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmcv.utils import Registry
+
+from .balanced_mse_loss import BMCLossMD
+from .cross_entropy_loss import CrossEntropyLoss
+from .focal_loss import FocalLoss
+from .gan_loss import GANLoss
+from .iou_loss import BoundedIoULoss, CIoULoss, DIoULoss, GIoULoss, IoULoss
+from .mse_loss import KeypointMSELoss, MSELoss
+from .prior_loss import (
+ CameraPriorLoss,
+ JointPriorLoss,
+ LimbLengthLoss,
+ MaxMixturePrior,
+ PoseRegLoss,
+ ShapePriorLoss,
+ ShapeThresholdPriorLoss,
+ SmoothJointLoss,
+ SmoothPelvisLoss,
+ SmoothTranslationLoss,
+)
+from .rotaion_distance_loss import RotationDistance
+from .smooth_l1_loss import L1Loss, SmoothL1Loss
+
+LOSSES = Registry('losses')
+
+LOSSES.register_module(name='GANLoss', module=GANLoss)
+LOSSES.register_module(name='MSELoss', module=MSELoss)
+LOSSES.register_module(name='KeypointMSELoss', module=KeypointMSELoss)
+LOSSES.register_module(name='ShapePriorLoss', module=ShapePriorLoss)
+LOSSES.register_module(name='PoseRegLoss', module=PoseRegLoss)
+LOSSES.register_module(name='LimbLengthLoss', module=LimbLengthLoss)
+LOSSES.register_module(name='JointPriorLoss', module=JointPriorLoss)
+LOSSES.register_module(name='SmoothJointLoss', module=SmoothJointLoss)
+LOSSES.register_module(name='SmoothPelvisLoss', module=SmoothPelvisLoss)
+LOSSES.register_module(name='SmoothTranslationLoss',
+ module=SmoothTranslationLoss)
+LOSSES.register_module(name='ShapeThresholdPriorLoss',
+ module=ShapeThresholdPriorLoss)
+LOSSES.register_module(name='CameraPriorLoss', module=CameraPriorLoss)
+LOSSES.register_module(name='MaxMixturePrior', module=MaxMixturePrior)
+LOSSES.register_module(name='L1Loss', module=L1Loss)
+LOSSES.register_module(name='SmoothL1Loss', module=SmoothL1Loss)
+LOSSES.register_module(name='CrossEntropyLoss', module=CrossEntropyLoss)
+LOSSES.register_module(name='RotationDistance', module=RotationDistance)
+LOSSES.register_module(name='BMCLossMD', module=BMCLossMD)
+LOSSES.register_module(name='FocalLoss', module=FocalLoss)
+LOSSES.register_module(name='IoULoss', module=IoULoss)
+LOSSES.register_module(name='BoundedIoULoss', module=BoundedIoULoss)
+LOSSES.register_module(name='GIoULoss', module=GIoULoss)
+LOSSES.register_module(name='DIoULoss', module=DIoULoss)
+LOSSES.register_module(name='CIoULoss', module=CIoULoss)
+
+
+def build_loss(cfg):
+ """Build loss."""
+ if cfg is None:
+ return None
+ return LOSSES.build(cfg)
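+
+
+if __name__ == '__main__':
+    # Illustrative only: losses are instantiated from config dicts via the
+    # registry, following the usual mmcv pattern.
+    mse = build_loss(dict(type='MSELoss', reduction='mean', loss_weight=1.0))
+    print(mse)
+    assert build_loss(None) is None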
diff --git a/detrsmpl/models/losses/cross_entropy_loss.py b/detrsmpl/models/losses/cross_entropy_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7ecdc375309c0fd097371faebffca8b842338d2
--- /dev/null
+++ b/detrsmpl/models/losses/cross_entropy_loss.py
@@ -0,0 +1,254 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .utils import weight_reduce_loss
+
+
+def cross_entropy(pred,
+ label,
+ weight=None,
+ reduction='mean',
+ avg_factor=None,
+ class_weight=None,
+ ignore_index=-100):
+ """Calculate the CrossEntropy loss.
+
+ Args:
+ pred (torch.Tensor): The prediction with shape (N, C), C is the number
+ of classes.
+ label (torch.Tensor): The learning label of the prediction.
+ weight (torch.Tensor, optional): Sample-wise loss weight.
+ reduction (str, optional): The method used to reduce the loss.
+ avg_factor (int, optional): Average factor that is used to average
+ the loss. Defaults to None.
+ class_weight (list[float], optional): The weight for each class.
+ ignore_index (int | None): The label index to be ignored.
+ If None, it will be set to default value. Default: -100.
+
+ Returns:
+ torch.Tensor: The calculated loss
+ """
+ # The default value of ignore_index is the same as F.cross_entropy
+ ignore_index = -100 if ignore_index is None else ignore_index
+ # element-wise losses
+ loss = F.cross_entropy(pred,
+ label,
+ weight=class_weight,
+ reduction='none',
+ ignore_index=ignore_index)
+
+ # apply weights and do the reduction
+ if weight is not None:
+ weight = weight.float()
+ loss = weight_reduce_loss(loss,
+ weight=weight,
+ reduction=reduction,
+ avg_factor=avg_factor)
+
+ return loss
+
+
+def _expand_onehot_labels(labels, label_weights, label_channels, ignore_index):
+ """Expand onehot labels to match the size of prediction."""
+ bin_labels = labels.new_full((labels.size(0), label_channels), 0)
+ valid_mask = (labels >= 0) & (labels != ignore_index)
+ inds = torch.nonzero(valid_mask & (labels < label_channels),
+ as_tuple=False)
+
+ if inds.numel() > 0:
+ bin_labels[inds, labels[inds]] = 1
+
+ valid_mask = valid_mask.view(-1, 1).expand(labels.size(0),
+ label_channels).float()
+ if label_weights is None:
+ bin_label_weights = valid_mask
+ else:
+ bin_label_weights = label_weights.view(-1, 1).repeat(1, label_channels)
+ bin_label_weights *= valid_mask
+
+ return bin_labels, bin_label_weights
+
+
+def binary_cross_entropy(pred,
+ label,
+ weight=None,
+ reduction='mean',
+ avg_factor=None,
+ class_weight=None,
+ ignore_index=-100):
+ """Calculate the binary CrossEntropy loss.
+
+ Args:
+ pred (torch.Tensor): The prediction with shape (N, 1).
+ label (torch.Tensor): The learning label of the prediction.
+ weight (torch.Tensor, optional): Sample-wise loss weight.
+ reduction (str, optional): The method used to reduce the loss.
+ Options are "none", "mean" and "sum".
+ avg_factor (int, optional): Average factor that is used to average
+ the loss. Defaults to None.
+ class_weight (list[float], optional): The weight for each class.
+ ignore_index (int | None): The label index to be ignored.
+ If None, it will be set to default value. Default: -100.
+
+ Returns:
+ torch.Tensor: The calculated loss.
+ """
+ # The default value of ignore_index is the same as F.cross_entropy
+ ignore_index = -100 if ignore_index is None else ignore_index
+ if pred.dim() != label.dim():
+ label, weight = _expand_onehot_labels(label, weight, pred.size(-1),
+ ignore_index)
+
+ # weighted element-wise losses
+ if weight is not None:
+ weight = weight.float()
+ loss = F.binary_cross_entropy_with_logits(pred,
+ label.float(),
+ pos_weight=class_weight,
+ reduction='none')
+ # do the reduction for the weighted loss
+ loss = weight_reduce_loss(loss,
+ weight,
+ reduction=reduction,
+ avg_factor=avg_factor)
+
+ return loss
+
+
+def mask_cross_entropy(pred,
+ target,
+ label,
+ reduction='mean',
+ avg_factor=None,
+ class_weight=None,
+ ignore_index=None):
+ """Calculate the CrossEntropy loss for masks.
+
+ Args:
+ pred (torch.Tensor): The prediction with shape (N, C, *), C is the
+ number of classes. The trailing * indicates arbitrary shape.
+ target (torch.Tensor): The learning label of the prediction.
+        label (torch.Tensor): ``label`` indicates the class label of the
+            object corresponding to the mask. This will be used to select the
+            mask of the class which the object belongs to when the mask
+            prediction is not class-agnostic.
+ reduction (str, optional): The method used to reduce the loss.
+ Options are "none", "mean" and "sum".
+ avg_factor (int, optional): Average factor that is used to average
+ the loss. Defaults to None.
+ class_weight (list[float], optional): The weight for each class.
+        ignore_index (None): Placeholder, to be consistent with other losses.
+            Default: None.
+
+ Returns:
+ torch.Tensor: The calculated loss
+
+ Example:
+ >>> N, C = 3, 11
+ >>> H, W = 2, 2
+ >>> pred = torch.randn(N, C, H, W) * 1000
+ >>> target = torch.rand(N, H, W)
+ >>> label = torch.randint(0, C, size=(N,))
+ >>> reduction = 'mean'
+ >>> avg_factor = None
+ >>> class_weights = None
+ >>> loss = mask_cross_entropy(pred, target, label, reduction,
+ >>> avg_factor, class_weights)
+ >>> assert loss.shape == (1,)
+ """
+ assert ignore_index is None, 'BCE loss does not support ignore_index'
+ # TODO: handle these two reserved arguments
+ assert reduction == 'mean' and avg_factor is None
+ num_rois = pred.size()[0]
+ inds = torch.arange(0, num_rois, dtype=torch.long, device=pred.device)
+ pred_slice = pred[inds, label].squeeze(1)
+ return F.binary_cross_entropy_with_logits(pred_slice,
+ target,
+ weight=class_weight,
+ reduction='mean')[None]
+
+
+class CrossEntropyLoss(nn.Module):
+ def __init__(self,
+ use_sigmoid=False,
+ use_mask=False,
+ reduction='mean',
+ class_weight=None,
+ ignore_index=None,
+ loss_weight=1.0):
+ """CrossEntropyLoss.
+
+ Args:
+            use_sigmoid (bool, optional): Whether the prediction uses sigmoid
+                instead of softmax. Defaults to False.
+ use_mask (bool, optional): Whether to use mask cross entropy loss.
+ Defaults to False.
+            reduction (str, optional): The method used to reduce the loss.
+                Defaults to 'mean'. Options are "none", "mean" and "sum".
+ class_weight (list[float], optional): Weight of each class.
+ Defaults to None.
+ ignore_index (int | None): The label index to be ignored.
+ Defaults to None.
+ loss_weight (float, optional): Weight of the loss. Defaults to 1.0.
+ """
+ super(CrossEntropyLoss, self).__init__()
+ assert (use_sigmoid is False) or (use_mask is False)
+ self.use_sigmoid = use_sigmoid
+ self.use_mask = use_mask
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+ self.class_weight = class_weight
+ self.ignore_index = ignore_index
+
+ if self.use_sigmoid:
+ self.cls_criterion = binary_cross_entropy
+ elif self.use_mask:
+ self.cls_criterion = mask_cross_entropy
+ else:
+ self.cls_criterion = cross_entropy
+
+ def forward(self,
+ cls_score,
+ label,
+ weight=None,
+ avg_factor=None,
+ reduction_override=None,
+ ignore_index=None,
+ **kwargs):
+ """Forward function.
+
+ Args:
+ cls_score (torch.Tensor): The prediction.
+ label (torch.Tensor): The learning label of the prediction.
+ weight (torch.Tensor, optional): Sample-wise loss weight.
+ avg_factor (int, optional): Average factor that is used to average
+ the loss. Defaults to None.
+ reduction_override (str, optional): The method used to reduce the
+ loss. Options are "none", "mean" and "sum".
+ ignore_index (int | None): The label index to be ignored.
+ If not None, it will override the default value. Default: None.
+ Returns:
+ torch.Tensor: The calculated loss.
+ """
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ if ignore_index is None:
+ ignore_index = self.ignore_index
+
+ if self.class_weight is not None:
+ class_weight = cls_score.new_tensor(self.class_weight,
+ device=cls_score.device)
+ else:
+ class_weight = None
+ loss_cls = self.loss_weight * self.cls_criterion(
+ cls_score,
+ label,
+ weight,
+ class_weight=class_weight,
+ reduction=reduction,
+ avg_factor=avg_factor,
+ ignore_index=ignore_index,
+ **kwargs)
+ return loss_cls
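+
+
+if __name__ == '__main__':
+    # Illustrative only: standard multi-class usage with integer labels.
+    criterion = CrossEntropyLoss(use_sigmoid=False, loss_weight=1.0)
+    logits = torch.randn(4, 5)           # (N, C)
+    labels = torch.randint(0, 5, (4, ))  # (N, )
+    print(criterion(logits, labels))     # scalar mean loss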
diff --git a/detrsmpl/models/losses/focal_loss.py b/detrsmpl/models/losses/focal_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..5687252d28978f838a66b456c4442de9d72df99e
--- /dev/null
+++ b/detrsmpl/models/losses/focal_loss.py
@@ -0,0 +1,241 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.ops import sigmoid_focal_loss as _sigmoid_focal_loss
+
+from .utils import weight_reduce_loss
+
+
+# This method is only for debugging
+def py_sigmoid_focal_loss(pred,
+ target,
+ weight=None,
+ gamma=2.0,
+ alpha=0.25,
+ reduction='mean',
+ avg_factor=None):
+    """PyTorch version of `Focal Loss <https://arxiv.org/abs/1708.02002>`_.
+
+ Args:
+ pred (torch.Tensor): The prediction with shape (N, C), C is the
+ number of classes
+ target (torch.Tensor): The learning label of the prediction.
+ weight (torch.Tensor, optional): Sample-wise loss weight.
+ gamma (float, optional): The gamma for calculating the modulating
+ factor. Defaults to 2.0.
+ alpha (float, optional): A balanced form for Focal Loss.
+ Defaults to 0.25.
+ reduction (str, optional): The method used to reduce the loss into
+ a scalar. Defaults to 'mean'.
+ avg_factor (int, optional): Average factor that is used to average
+ the loss. Defaults to None.
+ """
+ pred_sigmoid = pred.sigmoid()
+ target = target.type_as(pred)
+ pt = (1 - pred_sigmoid) * target + pred_sigmoid * (1 - target)
+ focal_weight = (alpha * target + (1 - alpha) *
+ (1 - target)) * pt.pow(gamma)
+ loss = F.binary_cross_entropy_with_logits(pred, target,
+ reduction='none') * focal_weight
+ if weight is not None:
+ if weight.shape != loss.shape:
+ if weight.size(0) == loss.size(0):
+ # For most cases, weight is of shape (num_priors, ),
+ # which means it does not have the second axis num_class
+ weight = weight.view(-1, 1)
+ else:
+ # Sometimes, weight per anchor per class is also needed. e.g.
+ # in FSAF. But it may be flattened of shape
+ # (num_priors x num_class, ), while loss is still of shape
+ # (num_priors, num_class).
+ assert weight.numel() == loss.numel()
+ weight = weight.view(loss.size(0), -1)
+ assert weight.ndim == loss.ndim
+ loss = weight_reduce_loss(loss, weight, reduction, avg_factor)
+ return loss
+
+
+def py_focal_loss_with_prob(pred,
+ target,
+ weight=None,
+ gamma=2.0,
+ alpha=0.25,
+ reduction='mean',
+ avg_factor=None):
+    """PyTorch version of `Focal Loss <https://arxiv.org/abs/1708.02002>`_.
+ Different from `py_sigmoid_focal_loss`, this function accepts probability
+ as input.
+
+ Args:
+ pred (torch.Tensor): The prediction probability with shape (N, C),
+ C is the number of classes.
+ target (torch.Tensor): The learning label of the prediction.
+ weight (torch.Tensor, optional): Sample-wise loss weight.
+ gamma (float, optional): The gamma for calculating the modulating
+ factor. Defaults to 2.0.
+ alpha (float, optional): A balanced form for Focal Loss.
+ Defaults to 0.25.
+ reduction (str, optional): The method used to reduce the loss into
+ a scalar. Defaults to 'mean'.
+ avg_factor (int, optional): Average factor that is used to average
+ the loss. Defaults to None.
+ """
+ num_classes = pred.size(1)
+ target = F.one_hot(target, num_classes=num_classes + 1)
+ target = target[:, :num_classes]
+
+ target = target.type_as(pred)
+ pt = (1 - pred) * target + pred * (1 - target)
+ focal_weight = (alpha * target + (1 - alpha) *
+ (1 - target)) * pt.pow(gamma)
+ loss = F.binary_cross_entropy(pred, target,
+ reduction='none') * focal_weight
+ if weight is not None:
+ if weight.shape != loss.shape:
+ if weight.size(0) == loss.size(0):
+ # For most cases, weight is of shape (num_priors, ),
+ # which means it does not have the second axis num_class
+ weight = weight.view(-1, 1)
+ else:
+ # Sometimes, weight per anchor per class is also needed. e.g.
+ # in FSAF. But it may be flattened of shape
+ # (num_priors x num_class, ), while loss is still of shape
+ # (num_priors, num_class).
+ assert weight.numel() == loss.numel()
+ weight = weight.view(loss.size(0), -1)
+ assert weight.ndim == loss.ndim
+ loss = weight_reduce_loss(loss, weight, reduction, avg_factor)
+ return loss
+
+
+def sigmoid_focal_loss(pred,
+ target,
+ weight=None,
+ gamma=2.0,
+ alpha=0.25,
+ reduction='mean',
+ avg_factor=None):
+    r"""A wrapper of the CUDA version of `Focal Loss
+    <https://arxiv.org/abs/1708.02002>`_.
+
+ Args:
+ pred (torch.Tensor): The prediction with shape (N, C), C is the number
+ of classes.
+ target (torch.Tensor): The learning label of the prediction.
+ weight (torch.Tensor, optional): Sample-wise loss weight.
+ gamma (float, optional): The gamma for calculating the modulating
+ factor. Defaults to 2.0.
+ alpha (float, optional): A balanced form for Focal Loss.
+ Defaults to 0.25.
+ reduction (str, optional): The method used to reduce the loss into
+ a scalar. Defaults to 'mean'. Options are "none", "mean" and "sum".
+ avg_factor (int, optional): Average factor that is used to average
+ the loss. Defaults to None.
+ """
+ # Function.apply does not accept keyword arguments, so the decorator
+ # "weighted_loss" is not applicable
+ loss = _sigmoid_focal_loss(pred.contiguous(), target.contiguous(), gamma,
+ alpha, None, 'none')
+ if weight is not None:
+ if weight.shape != loss.shape:
+ if weight.size(0) == loss.size(0):
+ # For most cases, weight is of shape (num_priors, ),
+ # which means it does not have the second axis num_class
+ weight = weight.view(-1, 1)
+ else:
+ # Sometimes, weight per anchor per class is also needed. e.g.
+ # in FSAF. But it may be flattened of shape
+ # (num_priors x num_class, ), while loss is still of shape
+ # (num_priors, num_class).
+ assert weight.numel() == loss.numel()
+ weight = weight.view(loss.size(0), -1)
+ assert weight.ndim == loss.ndim
+ loss = weight_reduce_loss(loss, weight, reduction, avg_factor)
+ return loss
+
+
+class FocalLoss(nn.Module):
+ def __init__(self,
+ use_sigmoid=True,
+ gamma=2.0,
+ alpha=0.25,
+ reduction='mean',
+ loss_weight=1.0,
+ activated=False):
+        """`Focal Loss <https://arxiv.org/abs/1708.02002>`_.
+
+ Args:
+            use_sigmoid (bool, optional): Whether the prediction uses sigmoid
+                instead of softmax. Defaults to True.
+ gamma (float, optional): The gamma for calculating the modulating
+ factor. Defaults to 2.0.
+ alpha (float, optional): A balanced form for Focal Loss.
+ Defaults to 0.25.
+ reduction (str, optional): The method used to reduce the loss into
+ a scalar. Defaults to 'mean'. Options are "none", "mean" and
+ "sum".
+ loss_weight (float, optional): Weight of loss. Defaults to 1.0.
+ activated (bool, optional): Whether the input is activated.
+ If True, it means the input has been activated and can be
+ treated as probabilities. Else, it should be treated as logits.
+ Defaults to False.
+ """
+ super(FocalLoss, self).__init__()
+ assert use_sigmoid is True, 'Only sigmoid focal loss supported now.'
+ self.use_sigmoid = use_sigmoid
+ self.gamma = gamma
+ self.alpha = alpha
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+ self.activated = activated
+
+ def forward(self,
+ pred,
+ target,
+ weight=None,
+ avg_factor=None,
+ reduction_override=None):
+ """Forward function.
+
+ Args:
+ pred (torch.Tensor): The prediction.
+ target (torch.Tensor): The learning label of the prediction.
+ weight (torch.Tensor, optional): The weight of loss for each
+ prediction. Defaults to None.
+ avg_factor (int, optional): Average factor that is used to average
+ the loss. Defaults to None.
+ reduction_override (str, optional): The reduction method used to
+ override the original reduction method of the loss.
+ Options are "none", "mean" and "sum".
+
+ Returns:
+ torch.Tensor: The calculated loss
+ """
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ if self.use_sigmoid:
+ if self.activated:
+ calculate_loss_func = py_focal_loss_with_prob
+ else:
+ if torch.cuda.is_available() and pred.is_cuda:
+ calculate_loss_func = sigmoid_focal_loss
+ else:
+ num_classes = pred.size(1)
+ target = F.one_hot(target, num_classes=num_classes + 1)
+ target = target[:, :num_classes]
+ calculate_loss_func = py_sigmoid_focal_loss
+
+ loss_cls = self.loss_weight * calculate_loss_func(
+ pred,
+ target,
+ weight,
+ gamma=self.gamma,
+ alpha=self.alpha,
+ reduction=reduction,
+ avg_factor=avg_factor)
+
+ else:
+ raise NotImplementedError
+ return loss_cls
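+
+
+if __name__ == '__main__':
+    # Illustrative only: sigmoid focal loss over C foreground classes, with
+    # the integer label C treated as background (dropped by the one-hot
+    # slicing in the forward pass above).
+    focal = FocalLoss(use_sigmoid=True, gamma=2.0, alpha=0.25)
+    cls_score = torch.randn(6, 3)        # (N, C) logits
+    labels = torch.randint(0, 4, (6, ))  # values in [0, C]
+    print(focal(cls_score, labels))      # scalar mean loss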
diff --git a/detrsmpl/models/losses/gan_loss.py b/detrsmpl/models/losses/gan_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f6b085acdbcfcce11cff6fb13bfebc7edb16080
--- /dev/null
+++ b/detrsmpl/models/losses/gan_loss.py
@@ -0,0 +1,90 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+
+
+class GANLoss(nn.Module):
+ """Define GAN loss.
+
+ Args:
+ gan_type (str): Support 'vanilla', 'lsgan', 'wgan', 'hinge'.
+ real_label_val (float): The value for real label. Default: 1.0.
+ fake_label_val (float): The value for fake label. Default: 0.0.
+ loss_weight (float): Loss weight. Default: 1.0.
+            Note that loss_weight is only applied to generators; it is
+            always 1.0 for discriminators.
+ """
+ def __init__(self,
+ gan_type,
+ real_label_val=1.0,
+ fake_label_val=0.0,
+ loss_weight=1.0):
+ super().__init__()
+ self.gan_type = gan_type
+ self.loss_weight = loss_weight
+ self.real_label_val = real_label_val
+ self.fake_label_val = fake_label_val
+
+ if self.gan_type == 'vanilla':
+ self.loss = nn.BCEWithLogitsLoss()
+ elif self.gan_type == 'lsgan':
+ self.loss = nn.MSELoss()
+ elif self.gan_type == 'wgan':
+ self.loss = self._wgan_loss
+ elif self.gan_type == 'hinge':
+ self.loss = nn.ReLU()
+ else:
+ raise NotImplementedError(
+ f'GAN type {self.gan_type} is not implemented.')
+
+ @staticmethod
+ def _wgan_loss(input, target):
+ """wgan loss.
+
+ Args:
+ input (Tensor): Input tensor.
+ target (bool): Target label.
+ Returns:
+ Tensor: wgan loss.
+ """
+ return -input.mean() if target else input.mean()
+
+ def get_target_label(self, input, target_is_real):
+ """Get target label.
+
+ Args:
+ input (Tensor): Input tensor.
+ target_is_real (bool): Whether the target is real or fake.
+ Returns:
+ (bool | Tensor): Target tensor. Return bool for wgan, otherwise,
+ return Tensor.
+ """
+
+ if self.gan_type == 'wgan':
+ return target_is_real
+ target_val = (self.real_label_val
+ if target_is_real else self.fake_label_val)
+ return input.new_ones(input.size()) * target_val
+
+ def forward(self, input, target_is_real, is_disc=False):
+ """
+ Args:
+ input (Tensor): The input for the loss module, i.e., the network
+ prediction.
+            target_is_real (bool): Whether the target is real or fake.
+            is_disc (bool): Whether the loss is for discriminators or not.
+ Default: False.
+ Returns:
+ Tensor: GAN loss value.
+ """
+ target_label = self.get_target_label(input, target_is_real)
+ if self.gan_type == 'hinge':
+ if is_disc: # for discriminators in hinge-gan
+ input = -input if target_is_real else input
+ loss = self.loss(1 + input).mean()
+ else: # for generators in hinge-gan
+ loss = -input.mean()
+ else: # other gan types
+ loss = self.loss(input, target_label)
+
+ # loss_weight is always 1.0 for discriminators
+ return loss if is_disc else loss * self.loss_weight
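+
+
+if __name__ == '__main__':
+    # Illustrative only: vanilla GAN loss computed from discriminator logits,
+    # once for a generator update and once for a discriminator update.
+    import torch
+    gan_loss = GANLoss(gan_type='vanilla')
+    d_out_fake = torch.randn(4, 1)
+    g_loss = gan_loss(d_out_fake, target_is_real=True, is_disc=False)
+    d_loss = gan_loss(d_out_fake, target_is_real=False, is_disc=True)
+    print(g_loss.item(), d_loss.item())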
diff --git a/detrsmpl/models/losses/iou_loss.py b/detrsmpl/models/losses/iou_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d3fdd3417fd2ee1a25fa28ce6c90c4700794ab2
--- /dev/null
+++ b/detrsmpl/models/losses/iou_loss.py
@@ -0,0 +1,458 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+import warnings
+
+import mmcv
+import torch
+import torch.nn as nn
+from mmdet.core import bbox_overlaps
+
+from .utils import weighted_loss
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def iou_loss(pred, target, linear=False, mode='log', eps=1e-6):
+ """IoU loss.
+
+ Computing the IoU loss between a set of predicted bboxes and target bboxes.
+ The loss is calculated as negative log of IoU.
+
+ Args:
+ pred (torch.Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+ shape (n, 4).
+ target (torch.Tensor): Corresponding gt bboxes, shape (n, 4).
+ linear (bool, optional): If True, use linear scale of loss instead of
+ log scale. Default: False.
+ mode (str): Loss scaling mode, including "linear", "square", and "log".
+ Default: 'log'
+ eps (float): Eps to avoid log(0).
+
+ Return:
+ torch.Tensor: Loss tensor.
+ """
+ assert mode in ['linear', 'square', 'log']
+ if linear:
+ mode = 'linear'
+ warnings.warn('DeprecationWarning: Setting "linear=True" in '
+ 'iou_loss is deprecated, please use "mode=`linear`" '
+ 'instead.')
+ ious = bbox_overlaps(pred, target, is_aligned=True).clamp(min=eps)
+ if mode == 'linear':
+ loss = 1 - ious
+ elif mode == 'square':
+ loss = 1 - ious**2
+ elif mode == 'log':
+ loss = -ious.log()
+ else:
+ raise NotImplementedError
+ return loss
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def bounded_iou_loss(pred, target, beta=0.2, eps=1e-3):
+ """BIoULoss.
+
+    This is an implementation of paper
+    `Improving Object Localization with Fitness NMS and Bounded IoU Loss.
+    <https://arxiv.org/abs/1711.00164>`_.
+
+ Args:
+ pred (torch.Tensor): Predicted bboxes.
+ target (torch.Tensor): Target bboxes.
+ beta (float): beta parameter in smoothl1.
+ eps (float): eps to avoid NaN.
+ """
+ pred_ctrx = (pred[:, 0] + pred[:, 2]) * 0.5
+ pred_ctry = (pred[:, 1] + pred[:, 3]) * 0.5
+ pred_w = pred[:, 2] - pred[:, 0]
+ pred_h = pred[:, 3] - pred[:, 1]
+ with torch.no_grad():
+ target_ctrx = (target[:, 0] + target[:, 2]) * 0.5
+ target_ctry = (target[:, 1] + target[:, 3]) * 0.5
+ target_w = target[:, 2] - target[:, 0]
+ target_h = target[:, 3] - target[:, 1]
+
+ dx = target_ctrx - pred_ctrx
+ dy = target_ctry - pred_ctry
+
+ loss_dx = 1 - torch.max(
+ (target_w - 2 * dx.abs()) /
+ (target_w + 2 * dx.abs() + eps), torch.zeros_like(dx))
+ loss_dy = 1 - torch.max(
+ (target_h - 2 * dy.abs()) /
+ (target_h + 2 * dy.abs() + eps), torch.zeros_like(dy))
+ loss_dw = 1 - torch.min(target_w / (pred_w + eps), pred_w /
+ (target_w + eps))
+ loss_dh = 1 - torch.min(target_h / (pred_h + eps), pred_h /
+ (target_h + eps))
+ # view(..., -1) does not work for empty tensor
+ loss_comb = torch.stack([loss_dx, loss_dy, loss_dw, loss_dh],
+ dim=-1).flatten(1)
+
+ loss = torch.where(loss_comb < beta, 0.5 * loss_comb * loss_comb / beta,
+ loss_comb - 0.5 * beta)
+ return loss
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def giou_loss(pred, target, eps=1e-7):
+    r"""`Generalized Intersection over Union: A Metric and A Loss for Bounding
+    Box Regression <https://arxiv.org/abs/1902.09630>`_.
+
+ Args:
+ pred (torch.Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+ shape (n, 4).
+ target (torch.Tensor): Corresponding gt bboxes, shape (n, 4).
+ eps (float): Eps to avoid log(0).
+
+ Return:
+ Tensor: Loss tensor.
+ """
+ gious = bbox_overlaps(pred, target, mode='giou', is_aligned=True, eps=eps)
+ loss = 1 - gious
+ return loss
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def diou_loss(pred, target, eps=1e-7):
+ r"""`Implementation of Distance-IoU Loss: Faster and Better
+ Learning for Bounding Box Regression, https://arxiv.org/abs/1911.08287`_.
+
+ Code is modified from https://github.com/Zzh-tju/DIoU.
+
+ Args:
+ pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+ shape (n, 4).
+ target (Tensor): Corresponding gt bboxes, shape (n, 4).
+ eps (float): Eps to avoid log(0).
+ Return:
+ Tensor: Loss tensor.
+ """
+ # overlap
+ lt = torch.max(pred[:, :2], target[:, :2])
+ rb = torch.min(pred[:, 2:], target[:, 2:])
+ wh = (rb - lt).clamp(min=0)
+ overlap = wh[:, 0] * wh[:, 1]
+
+ # union
+ ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1])
+ ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1])
+ union = ap + ag - overlap + eps
+
+ # IoU
+ ious = overlap / union
+
+ # enclose area
+ enclose_x1y1 = torch.min(pred[:, :2], target[:, :2])
+ enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:])
+ enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0)
+
+ cw = enclose_wh[:, 0]
+ ch = enclose_wh[:, 1]
+
+ c2 = cw**2 + ch**2 + eps
+
+ b1_x1, b1_y1 = pred[:, 0], pred[:, 1]
+ b1_x2, b1_y2 = pred[:, 2], pred[:, 3]
+ b2_x1, b2_y1 = target[:, 0], target[:, 1]
+ b2_x2, b2_y2 = target[:, 2], target[:, 3]
+
+ left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4
+ right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4
+ rho2 = left + right
+
+ # DIoU
+ dious = ious - rho2 / c2
+ loss = 1 - dious
+ return loss
+
+
+@mmcv.jit(derivate=True, coderize=True)
+@weighted_loss
+def ciou_loss(pred, target, eps=1e-7):
+    r"""Implementation of paper `Enhancing Geometric Factors into
+    Model Learning and Inference for Object Detection and Instance
+    Segmentation <https://arxiv.org/abs/2005.03572>`_.
+
+ Code is modified from https://github.com/Zzh-tju/CIoU.
+
+ Args:
+ pred (Tensor): Predicted bboxes of format (x1, y1, x2, y2),
+ shape (n, 4).
+ target (Tensor): Corresponding gt bboxes, shape (n, 4).
+ eps (float): Eps to avoid log(0).
+ Return:
+ Tensor: Loss tensor.
+ """
+ # overlap
+ lt = torch.max(pred[:, :2], target[:, :2])
+ rb = torch.min(pred[:, 2:], target[:, 2:])
+ wh = (rb - lt).clamp(min=0)
+ overlap = wh[:, 0] * wh[:, 1]
+
+ # union
+ ap = (pred[:, 2] - pred[:, 0]) * (pred[:, 3] - pred[:, 1])
+ ag = (target[:, 2] - target[:, 0]) * (target[:, 3] - target[:, 1])
+ union = ap + ag - overlap + eps
+
+ # IoU
+ ious = overlap / union
+
+ # enclose area
+ enclose_x1y1 = torch.min(pred[:, :2], target[:, :2])
+ enclose_x2y2 = torch.max(pred[:, 2:], target[:, 2:])
+ enclose_wh = (enclose_x2y2 - enclose_x1y1).clamp(min=0)
+
+ cw = enclose_wh[:, 0]
+ ch = enclose_wh[:, 1]
+
+ c2 = cw**2 + ch**2 + eps
+
+ b1_x1, b1_y1 = pred[:, 0], pred[:, 1]
+ b1_x2, b1_y2 = pred[:, 2], pred[:, 3]
+ b2_x1, b2_y1 = target[:, 0], target[:, 1]
+ b2_x2, b2_y2 = target[:, 2], target[:, 3]
+
+ w1, h1 = b1_x2 - b1_x1, b1_y2 - b1_y1 + eps
+ w2, h2 = b2_x2 - b2_x1, b2_y2 - b2_y1 + eps
+
+ left = ((b2_x1 + b2_x2) - (b1_x1 + b1_x2))**2 / 4
+ right = ((b2_y1 + b2_y2) - (b1_y1 + b1_y2))**2 / 4
+ rho2 = left + right
+
+ factor = 4 / math.pi**2
+ v = factor * torch.pow(torch.atan(w2 / h2) - torch.atan(w1 / h1), 2)
+
+ with torch.no_grad():
+ alpha = (ious > 0.5).float() * v / (1 - ious + v)
+
+ # CIoU
+ cious = ious - (rho2 / c2 + alpha * v)
+ loss = 1 - cious.clamp(min=-1.0, max=1.0)
+ return loss
+
+
+class IoULoss(nn.Module):
+ """IoULoss.
+
+ Computing the IoU loss between a set of predicted bboxes and target bboxes.
+
+ Args:
+ linear (bool): If True, use linear scale of loss else determined
+ by mode. Default: False.
+ eps (float): Eps to avoid log(0).
+ reduction (str): Options are "none", "mean" and "sum".
+ loss_weight (float): Weight of loss.
+ mode (str): Loss scaling mode, including "linear", "square", and "log".
+ Default: 'log'
+ """
+ def __init__(self,
+ linear=False,
+ eps=1e-6,
+ reduction='mean',
+ loss_weight=1.0,
+ mode='log'):
+ super(IoULoss, self).__init__()
+ assert mode in ['linear', 'square', 'log']
+ if linear:
+ mode = 'linear'
+ warnings.warn('DeprecationWarning: Setting "linear=True" in '
+ 'IOULoss is deprecated, please use "mode=`linear`" '
+ 'instead.')
+ self.mode = mode
+ self.linear = linear
+ self.eps = eps
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+
+ def forward(self,
+ pred,
+ target,
+ weight=None,
+ avg_factor=None,
+ reduction_override=None,
+ **kwargs):
+ """Forward function.
+
+ Args:
+ pred (torch.Tensor): The prediction.
+ target (torch.Tensor): The learning target of the prediction.
+ weight (torch.Tensor, optional): The weight of loss for each
+ prediction. Defaults to None.
+ avg_factor (int, optional): Average factor that is used to average
+ the loss. Defaults to None.
+ reduction_override (str, optional): The reduction method used to
+ override the original reduction method of the loss.
+ Defaults to None. Options are "none", "mean" and "sum".
+ """
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ if (weight is not None) and (not torch.any(weight > 0)) and (
+ reduction != 'none'):
+ if pred.dim() == weight.dim() + 1:
+ weight = weight.unsqueeze(1)
+ return (pred * weight).sum() # 0
+ if weight is not None and weight.dim() > 1:
+ # TODO: remove this in the future
+ # reduce the weight of shape (n, 4) to (n,) to match the
+ # iou_loss of shape (n,)
+ assert weight.shape == pred.shape
+ weight = weight.mean(-1)
+ loss = self.loss_weight * iou_loss(pred,
+ target,
+ weight,
+ mode=self.mode,
+ eps=self.eps,
+ reduction=reduction,
+ avg_factor=avg_factor,
+ **kwargs)
+ return loss
+
+
+class BoundedIoULoss(nn.Module):
+ def __init__(self, beta=0.2, eps=1e-3, reduction='mean', loss_weight=1.0):
+ super(BoundedIoULoss, self).__init__()
+ self.beta = beta
+ self.eps = eps
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+
+ def forward(self,
+ pred,
+ target,
+ weight=None,
+ avg_factor=None,
+ reduction_override=None,
+ **kwargs):
+ if weight is not None and not torch.any(weight > 0):
+ if pred.dim() == weight.dim() + 1:
+ weight = weight.unsqueeze(1)
+ return (pred * weight).sum() # 0
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ loss = self.loss_weight * bounded_iou_loss(pred,
+ target,
+ weight,
+ beta=self.beta,
+ eps=self.eps,
+ reduction=reduction,
+ avg_factor=avg_factor,
+ **kwargs)
+ return loss
+
+
+class GIoULoss(nn.Module):
+ def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0):
+ super(GIoULoss, self).__init__()
+ self.eps = eps
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+
+ def forward(self,
+ pred,
+ target,
+ weight=None,
+ avg_factor=None,
+ reduction_override=None,
+ **kwargs):
+ if weight is not None and not torch.any(weight > 0):
+ if pred.dim() == weight.dim() + 1:
+ weight = weight.unsqueeze(1)
+ return (pred * weight).sum() # 0
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ if weight is not None and weight.dim() > 1:
+ # TODO: remove this in the future
+ # reduce the weight of shape (n, 4) to (n,) to match the
+ # giou_loss of shape (n,)
+ assert weight.shape == pred.shape
+ weight = weight.mean(-1)
+ loss = self.loss_weight * giou_loss(pred,
+ target,
+ weight,
+ eps=self.eps,
+ reduction=reduction,
+ avg_factor=avg_factor,
+ **kwargs)
+ return loss
+
+
+class DIoULoss(nn.Module):
+ def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0):
+ super(DIoULoss, self).__init__()
+ self.eps = eps
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+
+ def forward(self,
+ pred,
+ target,
+ weight=None,
+ avg_factor=None,
+ reduction_override=None,
+ **kwargs):
+ if weight is not None and not torch.any(weight > 0):
+ if pred.dim() == weight.dim() + 1:
+ weight = weight.unsqueeze(1)
+ return (pred * weight).sum() # 0
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ if weight is not None and weight.dim() > 1:
+ # TODO: remove this in the future
+ # reduce the weight of shape (n, 4) to (n,) to match the
+            # diou_loss of shape (n,)
+ assert weight.shape == pred.shape
+ weight = weight.mean(-1)
+ loss = self.loss_weight * diou_loss(pred,
+ target,
+ weight,
+ eps=self.eps,
+ reduction=reduction,
+ avg_factor=avg_factor,
+ **kwargs)
+ return loss
+
+
+class CIoULoss(nn.Module):
+ def __init__(self, eps=1e-6, reduction='mean', loss_weight=1.0):
+ super(CIoULoss, self).__init__()
+ self.eps = eps
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+
+ def forward(self,
+ pred,
+ target,
+ weight=None,
+ avg_factor=None,
+ reduction_override=None,
+ **kwargs):
+ if weight is not None and not torch.any(weight > 0):
+ if pred.dim() == weight.dim() + 1:
+ weight = weight.unsqueeze(1)
+ return (pred * weight).sum() # 0
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ if weight is not None and weight.dim() > 1:
+ # TODO: remove this in the future
+ # reduce the weight of shape (n, 4) to (n,) to match the
+            # ciou_loss of shape (n,)
+ assert weight.shape == pred.shape
+ weight = weight.mean(-1)
+ loss = self.loss_weight * ciou_loss(pred,
+ target,
+ weight,
+ eps=self.eps,
+ reduction=reduction,
+ avg_factor=avg_factor,
+ **kwargs)
+ return loss
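+
+
+if __name__ == '__main__':
+    # Illustrative only: GIoU loss on aligned (x1, y1, x2, y2) boxes. The
+    # first pair matches exactly (loss ~0); the second only partially
+    # overlaps, so its loss is larger.
+    boxes_pred = torch.tensor([[0., 0., 10., 10.], [5., 5., 15., 15.]])
+    boxes_gt = torch.tensor([[0., 0., 10., 10.], [0., 0., 10., 10.]])
+    print(GIoULoss(reduction='none')(boxes_pred, boxes_gt))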
diff --git a/detrsmpl/models/losses/mse_loss.py b/detrsmpl/models/losses/mse_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..24d3ae6e5f5b3b16cb4c0501f246c75320b4fdb0
--- /dev/null
+++ b/detrsmpl/models/losses/mse_loss.py
@@ -0,0 +1,171 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .utils import weighted_loss
+
+
+def gmof(x, sigma):
+ """Geman-McClure error function."""
+ x_squared = x**2
+ sigma_squared = sigma**2
+ return (sigma_squared * x_squared) / (sigma_squared + x_squared)
+
+
+@weighted_loss
+def mse_loss(pred, target):
+    """Wrapper of MSE loss."""
+ return F.mse_loss(pred, target, reduction='none')
+
+
+@weighted_loss
+def mse_loss_with_gmof(pred, target, sigma):
+ """Extended MSE Loss with GMOF."""
+ loss = F.mse_loss(pred, target, reduction='none')
+ loss = gmof(loss, sigma)
+ return loss
+
+
+class MSELoss(nn.Module):
+ """MSELoss.
+
+ Args:
+ reduction (str, optional): The method that reduces the loss to a
+ scalar. Options are "none", "mean" and "sum".
+ loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+ """
+ def __init__(self, reduction='mean', loss_weight=1.0):
+ super().__init__()
+ assert reduction in (None, 'none', 'mean', 'sum')
+ reduction = 'none' if reduction is None else reduction
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+
+ def forward(self,
+ pred,
+ target,
+ weight=None,
+ avg_factor=None,
+ reduction_override=None):
+ """Forward function of loss.
+
+ Args:
+ pred (torch.Tensor): The prediction.
+ target (torch.Tensor): The learning target of the prediction.
+ weight (torch.Tensor, optional): Weight of the loss for each
+ prediction. Defaults to None.
+ avg_factor (int, optional): Average factor that is used to average
+ the loss. Defaults to None.
+ reduction_override (str, optional): The reduction method used to
+ override the original reduction method of the loss.
+ Defaults to None.
+ Returns:
+ torch.Tensor: The calculated loss
+ """
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ loss = self.loss_weight * mse_loss(
+ pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+ return loss
+
+
+class KeypointMSELoss(nn.Module):
+ """MSELoss for 2D and 3D keypoints.
+
+ Args:
+ reduction (str, optional): The method that reduces the loss to a
+ scalar. Options are "none", "mean" and "sum".
+ loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+        sigma (float, optional): Weighting parameter of the Geman-McClure
+            error function. Defaults to 1.0 (no effect).
+        keypoint_weight (List[float], optional): Weighting parameter for each
+            keypoint. Shape should be (K, ). K: number of keypoints.
+            Defaults to None (no effect).
+ """
+ def __init__(self,
+ reduction='mean',
+ loss_weight=1.0,
+ sigma=1.0,
+ keypoint_weight=None):
+ super().__init__()
+ assert reduction in (None, 'none', 'mean', 'sum')
+ reduction = 'none' if reduction is None else reduction
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+ self.sigma = sigma
+ if keypoint_weight is None:
+ self.keypoint_weight = None
+ else:
+ self.keypoint_weight = torch.Tensor(keypoint_weight)
+
+ def forward(self,
+ pred,
+ target,
+ pred_conf=None,
+ target_conf=None,
+ keypoint_weight=None,
+ avg_factor=None,
+ loss_weight_override=None,
+ reduction_override=None):
+ """Forward function of loss.
+
+ Args:
+            pred (torch.Tensor): The prediction. Shape should be (N, K, 2/3).
+                N: batch size. K: number of keypoints.
+ target (torch.Tensor): The learning target of the prediction.
+ Shape should be the same as pred.
+ pred_conf (optional, torch.Tensor): Confidence of
+ predicted keypoints. Shape should be (N, K).
+ target_conf (optional, torch.Tensor): Confidence of
+ target keypoints. Shape should be the same as pred_conf.
+            keypoint_weight (optional, torch.Tensor): Keypoint-wise weight.
+                Shape should be (K, ). This weight allows different weights
+                to be assigned to different body parts.
+ avg_factor (int, optional): Average factor that is used to average
+ the loss. Defaults to None.
+ loss_weight_override (float, optional): The overall weight of loss
+ used to override the original weight of loss.
+ reduction_override (str, optional): The reduction method used to
+ override the original reduction method of the loss.
+ Defaults to None.
+ Returns:
+ torch.Tensor: The calculated loss
+ """
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ loss_weight = (loss_weight_override if loss_weight_override is not None
+ else self.loss_weight)
+
+ B, K, D = pred.shape
+ pred_conf = pred_conf.view((B, K, 1)) \
+ if pred_conf is not None else 1.0
+ target_conf = target_conf.view((B, K, 1)) \
+ if target_conf is not None else 1.0
+        if keypoint_weight is not None:
+            keypoint_weight = keypoint_weight.view((1, K, 1))
+        elif self.keypoint_weight is not None:
+            keypoint_weight = \
+                self.keypoint_weight.view((1, K, 1)).type_as(pred)
+        else:
+            keypoint_weight = 1.0
+
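+        # The effective per-keypoint weight is the product of the (optional)
+        # keypoint-level prior weight and the predicted/target confidences;
+        # any missing term defaults to 1.0 and has no effect.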
+ weight = keypoint_weight * pred_conf * target_conf
+ assert isinstance(
+ weight,
+ float) or weight.shape == (B, K, 1) or weight.shape == (1, K, 1)
+
+ # B, J, D = pred.shape[:2]
+ # if len(weight.shape) == 1:
+ # # for simplify tools
+ # weight = weight.view(1, -1, 1)
+ # else:
+ # # for body model estimator
+ # weight = weight.view(B, J, 1)
+
+ loss = loss_weight * mse_loss_with_gmof(pred,
+ target,
+ weight,
+ reduction=reduction,
+ avg_factor=avg_factor,
+ sigma=self.sigma)
+
+ return loss
diff --git a/detrsmpl/models/losses/prior_loss.py b/detrsmpl/models/losses/prior_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..5509916afd539074f871856053c7bd61d8f2ccf3
--- /dev/null
+++ b/detrsmpl/models/losses/prior_loss.py
@@ -0,0 +1,754 @@
+import itertools
+import os
+import pickle
+import sys
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from detrsmpl.core.conventions.joints_mapping.standard_joint_angles import (
+ STANDARD_JOINT_ANGLE_LIMITS,
+ TRANSFORMATION_AA_TO_SJA,
+ TRANSFORMATION_SJA_TO_AA,
+)
+from detrsmpl.utils.keypoint_utils import search_limbs
+from detrsmpl.utils.transforms import aa_to_rot6d, aa_to_sja
+
+
+class ShapePriorLoss(nn.Module):
+ """Prior loss for body shape parameters.
+
+ Args:
+ reduction (str, optional): The method that reduces the loss to a
+ scalar. Options are "none", "mean" and "sum".
+ loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+ """
+ def __init__(self, reduction='mean', loss_weight=1.0):
+ super().__init__()
+ assert reduction in (None, 'none', 'mean', 'sum')
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+
+ def forward(self,
+ betas,
+ loss_weight_override=None,
+ reduction_override=None):
+ """Forward function of loss.
+
+ Args:
+ betas (torch.Tensor): The body shape parameters
+ loss_weight_override (float, optional): The weight of loss used to
+ override the original weight of loss
+ reduction_override (str, optional): The reduction method used to
+ override the original reduction method of the loss.
+ Defaults to None
+ Returns:
+ torch.Tensor: The calculated loss
+ """
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ loss_weight = (loss_weight_override if loss_weight_override is not None
+ else self.loss_weight)
+
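+        # Quadratic prior on betas: penalizes deviation from the mean shape
+        # (betas == 0), discouraging extreme body shapes.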
+ shape_prior_loss = loss_weight * betas**2
+
+ if reduction == 'mean':
+ shape_prior_loss = shape_prior_loss.mean()
+ elif reduction == 'sum':
+ shape_prior_loss = shape_prior_loss.sum()
+
+ return shape_prior_loss
+
+
+class ShapeThresholdPriorLoss(nn.Module):
+    """Threshold loss for betas. Soft constraint to prevent the parameters
+    from leaving the feasible set. Implements a penalty that encourages the
+    parameters to stay in the feasible set of solutions.
+
+ Args:
+ margin (int, optional): The threshold value
+        norm (str, optional): The loss method. Options are 'l1' and 'l2'.
+ loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+ """
+ def __init__(self, margin=1, norm='l2', epsilon=1e-7, loss_weight=1.0):
+ super().__init__()
+ self.margin = margin
+        assert norm in ['l1', 'l2'], 'Norm variable must be l1 or l2'
+ self.norm = norm
+ self.epsilon = epsilon
+ self.loss_weight = loss_weight
+
+ def forward(self, betas):
+ """Forward function of loss.
+
+ Args:
+ betas (torch.Tensor): The body shape parameters
+ Returns:
+ torch.Tensor: The calculated loss
+ """
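+        # Only components whose magnitude exceeds the margin are penalized;
+        # the sum is normalized by the number of violating entries and
+        # epsilon guards against division by zero when nothing violates.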
+ abs_values = betas.abs()
+ mask = abs_values.gt(self.margin)
+ invalid_values = torch.masked_select(betas, mask)
+
+ if self.norm == 'l1':
+ return self.loss_weight * invalid_values.abs().sum() / (
+ mask.to(dtype=betas.dtype).sum() + self.epsilon)
+ elif self.norm == 'l2':
+ return self.loss_weight * invalid_values.pow(2).sum() / (
+ mask.to(dtype=betas.dtype).sum() + self.epsilon)
+
+
+class PoseRegLoss(nn.Module):
+    """Regularizer loss for body pose parameters.
+
+ Args:
+ reduction (str, optional): The method that reduces the loss to a
+ scalar. Options are "none", "mean" and "sum".
+ loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+ """
+ def __init__(self, reduction='mean', loss_weight=1.0):
+ super().__init__()
+ assert reduction in (None, 'none', 'mean', 'sum')
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+
+ def forward(self,
+ body_pose,
+ weight=None,
+ avg_factor=None,
+ loss_weight_override=None,
+ reduction_override=None):
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ loss_weight = (loss_weight_override if loss_weight_override is not None
+ else self.loss_weight)
+
+ pose_prior_loss = loss_weight * (body_pose**2)
+
+ if reduction == 'mean':
+ pose_prior_loss = pose_prior_loss.mean()
+ elif reduction == 'sum':
+ pose_prior_loss = pose_prior_loss.sum()
+
+ return pose_prior_loss
+
+
+class LimbLengthLoss(nn.Module):
+    """Limb length loss for body shape parameters. As betas are associated
+    with the height of a person, fitting on limb length helps determine body
+    shape parameters. It penalizes the squared difference between the target
+    and predicted limb lengths. Note that it should take keypoints3d as
+    input, as limb length computed from keypoints2d varies with the camera.
+
+ Args:
+ convention (str): Limb convention to search for keypoint connections.
+ reduction (str, optional): The method that reduces the loss to a
+ scalar. Options are "none", "mean" and "sum".
+ loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+ eps (float, optional): epsilon for computing normalized limb vector.
+ Defaults to 1e-4.
+ """
+ def __init__(self,
+ convention,
+ reduction='mean',
+ loss_weight=1.0,
+ eps=1e-4):
+ super().__init__()
+ assert reduction in (None, 'none', 'mean', 'sum')
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+ self.eps = eps
+ limb_idxs, _ = search_limbs(data_source=convention)
+ limb_idxs = sorted(limb_idxs['body'])
+ self.limb_idxs = np.array(
+ list(x for x, _ in itertools.groupby(limb_idxs)))
+
+ def _compute_limb_length(self, keypoints3d):
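+        # Each row of limb_idxs holds a (start, end) keypoint index pair; the
+        # limb length is the Euclidean distance between the two endpoints.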
+ kp_src = keypoints3d[:, self.limb_idxs[:, 0], :3]
+ kp_dst = keypoints3d[:, self.limb_idxs[:, 1], :3]
+ limb_vec = kp_dst - kp_src
+ limb_length = torch.norm(limb_vec, dim=2)
+ return limb_length
+
+ def _keypoint_conf_to_limb_conf(self, keypoint_conf):
+ limb_conf = torch.min(keypoint_conf[:, self.limb_idxs[:, 1]],
+ keypoint_conf[:, self.limb_idxs[:, 0]])
+ return limb_conf
+
+ def forward(self,
+ pred,
+ target,
+ pred_conf=None,
+ target_conf=None,
+ loss_weight_override=None,
+ reduction_override=None):
+ """Forward function of LimbLengthLoss.
+
+ Args:
+            pred (torch.Tensor): The predicted SMPL keypoints3d.
+                Shape should be (N, K, 3).
+                N: batch size. K: number of keypoints.
+ target (torch.Tensor): The ground-truth keypoints3d.
+ Shape should be (N, K, 3).
+ pred_conf (torch.Tensor, optional): Confidence of
+ predicted keypoints. Shape should be (N, K).
+ target_conf (torch.Tensor, optional): Confidence of
+ target keypoints. Shape should be (N, K).
+ loss_weight_override (float, optional): The weight of loss used to
+ override the original weight of loss. Defaults to None.
+ reduction_override (str, optional): The reduction method used to
+ override the original reduction method of the loss.
+ Defaults to None
+ Returns:
+ torch.Tensor: The calculated loss
+ """
+ assert pred.dim() == 3 and pred.shape[-1] == 3
+ assert pred.shape == target.shape
+ if pred_conf is not None:
+ assert pred_conf.dim() == 2
+ assert pred_conf.shape == pred.shape[:2]
+ if target_conf is not None:
+ assert target_conf.dim() == 2
+ assert target_conf.shape == target.shape[:2]
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ loss_weight = (loss_weight_override if loss_weight_override is not None
+ else self.loss_weight)
+
+ limb_len_target = self._compute_limb_length(target)
+ limb_len_pred = self._compute_limb_length(pred)
+
+ if target_conf is None:
+ target_conf = torch.ones_like(target[..., 0])
+ if pred_conf is None:
+ pred_conf = torch.ones_like(pred[..., 0])
+ limb_conf_target = self._keypoint_conf_to_limb_conf(target_conf)
+ limb_conf_pred = self._keypoint_conf_to_limb_conf(pred_conf)
+ limb_conf = limb_conf_target * limb_conf_pred
+
+ diff_len = limb_len_target - limb_len_pred
+ loss = diff_len**2 * limb_conf
+
+ if reduction == 'mean':
+ loss = loss.mean()
+ elif reduction == 'sum':
+ loss = loss.sum()
+
+ loss *= loss_weight
+
+ return loss
+
+
+class JointPriorLoss(nn.Module):
+ """Prior loss for joint angles.
+
+ Args:
+ reduction (str, optional): The method that reduces the loss to a
+ scalar. Options are "none", "mean" and "sum".
+ loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+        use_full_body (bool, optional): Whether to use the full set of joint
+            constraints (in standard joint angles).
+        smooth_spine (bool, optional): Whether to encourage smooth spine
+            rotations.
+        smooth_spine_loss_weight (float, optional): An additional weight
+            factor applied to the smooth spine loss.
+ """
+ def __init__(self,
+ reduction='mean',
+ loss_weight=1.0,
+ use_full_body=False,
+ smooth_spine=False,
+ smooth_spine_loss_weight=1.0):
+ super().__init__()
+ assert reduction in (None, 'none', 'mean', 'sum')
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+ self.use_full_body = use_full_body
+ self.smooth_spine = smooth_spine
+ self.smooth_spine_loss_weight = smooth_spine_loss_weight
+
+ if self.use_full_body:
+ self.register_buffer('R_t', TRANSFORMATION_AA_TO_SJA)
+ self.register_buffer('R_t_inv', TRANSFORMATION_SJA_TO_AA)
+ self.register_buffer('sja_limits', STANDARD_JOINT_ANGLE_LIMITS)
+
+ def forward(self,
+ body_pose,
+ loss_weight_override=None,
+ reduction_override=None):
+ """Forward function of loss.
+
+ Args:
+ body_pose (torch.Tensor): The body pose parameters
+ loss_weight_override (float, optional): The weight of loss used to
+ override the original weight of loss
+ reduction_override (str, optional): The reduction method used to
+ override the original reduction method of the loss.
+ Defaults to None
+ Returns:
+ torch.Tensor: The calculated loss
+ """
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ loss_weight = (loss_weight_override if loss_weight_override is not None
+ else self.loss_weight)
+
+ if self.use_full_body:
+ batch_size = body_pose.shape[0]
+ body_pose_reshape = body_pose.reshape(batch_size, -1, 3)
+ assert body_pose_reshape.shape[1] in (21, 23) # smpl-x, smpl
+ body_pose_reshape = body_pose_reshape[:, :21, :]
+
+ body_pose_sja = aa_to_sja(body_pose_reshape, self.R_t,
+ self.R_t_inv)
+
+ lower_limits = self.sja_limits[:, :, 0] # shape: (21, 3)
+ upper_limits = self.sja_limits[:, :, 1] # shape: (21, 3)
+
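+            # Exponential barrier: (exp(relu(violation)) - 1)^2 is zero while
+            # a standard joint angle stays within its limits and grows rapidly
+            # once it crosses the lower or upper bound.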
+ lower_loss = (torch.exp(F.relu(lower_limits - body_pose_sja)) -
+ 1).pow(2)
+ upper_loss = (torch.exp(F.relu(body_pose_sja - upper_limits)) -
+ 1).pow(2)
+
+ standard_joint_angle_prior_loss = (lower_loss + upper_loss).view(
+                body_pose.shape[0], -1)  # shape: (n, 63)
+
+ joint_prior_loss = standard_joint_angle_prior_loss
+
+ else:
+ # default joint prior loss applied on elbows and knees
+ joint_prior_loss = (torch.exp(
+ body_pose[:, [55, 58, 12, 15]] *
+ torch.tensor([1., -1., -1, -1.], device=body_pose.device)) -
+ 1)**2
+
+ if self.smooth_spine:
+ spine1 = body_pose[:, [9, 10, 11]]
+ spine2 = body_pose[:, [18, 19, 20]]
+ spine3 = body_pose[:, [27, 28, 29]]
+ smooth_spine_loss_12 = (torch.exp(F.relu(-spine1 * spine2)) -
+ 1).pow(2) * self.smooth_spine_loss_weight
+ smooth_spine_loss_23 = (torch.exp(F.relu(-spine2 * spine3)) -
+ 1).pow(2) * self.smooth_spine_loss_weight
+
+ joint_prior_loss = torch.cat(
+ [joint_prior_loss, smooth_spine_loss_12, smooth_spine_loss_23],
+ axis=1)
+
+ joint_prior_loss = loss_weight * joint_prior_loss
+
+ if reduction == 'mean':
+ joint_prior_loss = joint_prior_loss.mean()
+ elif reduction == 'sum':
+ joint_prior_loss = joint_prior_loss.sum()
+
+ return joint_prior_loss
+
+
+class SmoothJointLoss(nn.Module):
+ """Smooth loss for joint angles.
+
+ Args:
+ reduction (str, optional): The method that reduces the loss to a
+ scalar. Options are "none", "mean" and "sum".
+ loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+        degree (bool, optional): Whether the input tensor is in degrees
+            rather than radians. Defaults to False.
+        loss_func (str, optional): The loss function, either 'L1' or 'L2'.
+            Defaults to 'L1'.
+ """
+ def __init__(self,
+ reduction='mean',
+ loss_weight=1.0,
+ degree=False,
+ loss_func='L1'):
+ super().__init__()
+ assert reduction in (None, 'none', 'mean', 'sum')
+ assert loss_func in ('L1', 'L2')
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+ self.degree = degree
+ self.loss_func = loss_func
+
+ def forward(self,
+ body_pose,
+ loss_weight_override=None,
+ reduction_override=None):
+ """Forward function of SmoothJointLoss.
+
+ Args:
+ body_pose (torch.Tensor): The body pose parameters
+ loss_weight_override (float, optional): The weight of loss used to
+ override the original weight of loss
+ reduction_override (str, optional): The reduction method used to
+ override the original reduction method of the loss.
+ Defaults to None
+ Returns:
+ torch.Tensor: The calculated loss
+ """
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ loss_weight = (loss_weight_override if loss_weight_override is not None
+ else self.loss_weight)
+
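+        # Treat the batch dimension as time: convert each pose to the 6D
+        # rotation representation and penalize frame-to-frame differences.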
+ theta = body_pose.reshape(body_pose.shape[0], -1, 3)
+ if self.degree:
+ theta = torch.deg2rad(theta)
+ rot_6d = aa_to_rot6d(theta)
+ rot_6d_diff = rot_6d[1:] - rot_6d[:-1]
+
+ if self.loss_func == 'L2':
+ smooth_joint_loss = (rot_6d_diff**2).sum(dim=[1, 2])
+ elif self.loss_func == 'L1':
+ smooth_joint_loss = rot_6d_diff.abs().sum(dim=[1, 2])
+ else:
+            raise TypeError(f'{self.loss_func} is not defined')
+
+ # add zero padding to retain original batch_size
+ smooth_joint_loss = torch.cat(
+ [torch.zeros_like(smooth_joint_loss)[:1], smooth_joint_loss])
+
+ if reduction == 'mean':
+ smooth_joint_loss = smooth_joint_loss.mean()
+ elif reduction == 'sum':
+ smooth_joint_loss = smooth_joint_loss.sum()
+
+ smooth_joint_loss *= loss_weight
+
+ return smooth_joint_loss
+
+
+class SmoothPelvisLoss(nn.Module):
+ """Smooth loss for pelvis angles.
+
+ Args:
+ reduction (str, optional): The method that reduces the loss to a
+ scalar. Options are "none", "mean" and "sum".
+ loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+        degree (bool, optional): Whether the input tensor is in degrees
+            rather than radians. Defaults to False.
+ """
+ def __init__(self, reduction='mean', loss_weight=1.0, degree=False):
+ super().__init__()
+ assert reduction in (None, 'none', 'mean', 'sum')
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+ self.degree = degree
+
+ def forward(self,
+ global_orient,
+ loss_weight_override=None,
+ reduction_override=None):
+ """Forward function of SmoothPelvisLoss.
+
+ Args:
+ global_orient (torch.Tensor): The global orientation parameters
+ loss_weight_override (float, optional): The weight of loss used to
+ override the original weight of loss
+ reduction_override (str, optional): The reduction method used to
+ override the original reduction method of the loss.
+ Defaults to None
+ Returns:
+ torch.Tensor: The calculated loss
+ """
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ loss_weight = (loss_weight_override if loss_weight_override is not None
+ else self.loss_weight)
+
+ if self.degree:
+ global_orient = torch.deg2rad(global_orient)
+
+ pelvis = global_orient.unsqueeze(1)
+ rot_6d = aa_to_rot6d(pelvis)
+
+ rot_6d_diff = rot_6d[1:] - rot_6d[:-1]
+ smooth_pelvis_loss = rot_6d_diff.abs().sum(dim=-1)
+
+ # add zero padding to retain original batch_size
+ smooth_pelvis_loss = torch.cat(
+ [torch.zeros_like(smooth_pelvis_loss)[:1],
+ smooth_pelvis_loss]).sum(dim=-1)
+
+ smooth_pelvis_loss = loss_weight * smooth_pelvis_loss
+
+ if reduction == 'mean':
+ smooth_pelvis_loss = smooth_pelvis_loss.mean()
+ elif reduction == 'sum':
+ smooth_pelvis_loss = smooth_pelvis_loss.sum()
+
+ return smooth_pelvis_loss
+
+
+class SmoothTranslationLoss(nn.Module):
+ """Smooth loss for translations.
+
+ Args:
+ reduction (str, optional): The method that reduces the loss to a
+ scalar. Options are "none", "mean" and "sum".
+ loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+ """
+ def __init__(self, reduction='mean', loss_weight=1.0):
+ super().__init__()
+ assert reduction in (None, 'none', 'mean', 'sum')
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+
+ def forward(self,
+ translation,
+ loss_weight_override=None,
+ reduction_override=None):
+ """Forward function of loss.
+
+ Args:
+ translation (torch.Tensor): The body translation parameters
+ loss_weight_override (float, optional): The weight of loss used to
+ override the original weight of loss
+ reduction_override (str, optional): The reduction method used to
+ override the original reduction method of the loss.
+ Defaults to None
+ Returns:
+ torch.Tensor: The calculated loss
+ """
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ loss_weight = (loss_weight_override if loss_weight_override is not None
+ else self.loss_weight)
+
+ translation_diff = translation[1:] - translation[:-1]
+ smooth_translation_loss = translation_diff.abs().sum(dim=-1,
+ keepdim=True)
+
+ # add zero padding to retain original batch_size
+ smooth_translation_loss = torch.cat([
+ torch.zeros_like(smooth_translation_loss)[:1],
+ smooth_translation_loss
+ ]).sum(dim=-1)
+
+ smooth_translation_loss *= 1e3
+
+ smooth_translation_loss = loss_weight * \
+ smooth_translation_loss
+
+ if reduction == 'mean':
+ smooth_translation_loss = smooth_translation_loss.mean()
+ elif reduction == 'sum':
+ smooth_translation_loss = smooth_translation_loss.sum()
+
+ return smooth_translation_loss
+
+
+class CameraPriorLoss(nn.Module):
+ """Prior loss for predicted camera.
+
+ Args:
+ reduction (str, optional): The method that reduces the loss to a
+ scalar. Options are "none", "mean" and "sum".
+ scale (float, optional): The scale coefficient for regularizing camera
+ parameters. Defaults to 10
+ loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+ """
+ def __init__(self, scale=10, reduction='mean', loss_weight=1.0):
+ super().__init__()
+ self.scale = scale
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+
+ def forward(self,
+ cameras,
+ loss_weight_override=None,
+ reduction_override=None):
+ """Forward function of loss.
+
+ Args:
+ cameras (torch.Tensor): The predicted camera parameters
+ loss_weight_override (float, optional): The weight of loss used to
+ override the original weight of loss
+ reduction_override (str, optional): The reduction method used to
+ override the original reduction method of the loss.
+ Defaults to None
+ Returns:
+ torch.Tensor: The calculated loss
+ """
+
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ loss_weight = (loss_weight_override if loss_weight_override is not None
+ else self.loss_weight)
+
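+        # exp(-self.scale * cameras[:, 0]) stays small when the first camera
+        # parameter (typically the weak-perspective scale) is a reasonable
+        # positive value and blows up as it approaches zero or turns negative.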
+ camera_prior_loss = torch.exp(-cameras[:, 0] * self.scale)
+ camera_prior_loss = torch.pow(camera_prior_loss, 2) * loss_weight
+
+ if reduction == 'mean':
+ camera_prior_loss = camera_prior_loss.mean()
+ elif reduction == 'sum':
+ camera_prior_loss = camera_prior_loss.sum()
+
+ return camera_prior_loss
+
+
+class MaxMixturePrior(nn.Module):
+ """Ref: SMPLify-X
+ https://github.com/vchoutas/smplify-x/blob/master/smplifyx/prior.py
+ """
+ def __init__(self,
+ prior_folder='data',
+ num_gaussians=8,
+ dtype=torch.float32,
+ epsilon=1e-16,
+ use_merged=True,
+ reduction=None,
+ loss_weight=1.0):
+ super(MaxMixturePrior, self).__init__()
+
+ assert reduction in (None, 'none', 'mean', 'sum')
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+
+ if dtype == torch.float32:
+ np_dtype = np.float32
+ elif dtype == torch.float64:
+ np_dtype = np.float64
+ else:
+ print('Unknown float type {}, exiting!'.format(dtype))
+ sys.exit(-1)
+
+ self.num_gaussians = num_gaussians
+ self.epsilon = epsilon
+ self.use_merged = use_merged
+ gmm_fn = 'gmm_{:02d}.pkl'.format(num_gaussians)
+
+ full_gmm_fn = os.path.join(prior_folder, gmm_fn)
+ if not os.path.exists(full_gmm_fn):
+ print('The path to the mixture prior "{}"'.format(full_gmm_fn) +
+ ' does not exist, exiting!')
+ sys.exit(-1)
+
+ with open(full_gmm_fn, 'rb') as f:
+ gmm = pickle.load(f, encoding='latin1')
+
+ if type(gmm) == dict:
+ means = gmm['means'].astype(np_dtype)
+ covs = gmm['covars'].astype(np_dtype)
+ weights = gmm['weights'].astype(np_dtype)
+ elif 'sklearn.mixture.gmm.GMM' in str(type(gmm)):
+ means = gmm.means_.astype(np_dtype)
+ covs = gmm.covars_.astype(np_dtype)
+ weights = gmm.weights_.astype(np_dtype)
+ else:
+ print('Unknown type for the prior: {}, exiting!'.format(type(gmm)))
+ sys.exit(-1)
+
+ self.register_buffer('means', torch.tensor(means, dtype=dtype))
+
+ self.register_buffer('covs', torch.tensor(covs, dtype=dtype))
+
+ precisions = [np.linalg.inv(cov) for cov in covs]
+ precisions = np.stack(precisions).astype(np_dtype)
+
+ self.register_buffer('precisions', torch.tensor(precisions,
+ dtype=dtype))
+
+ # The constant term:
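+        # (69 = 23 SMPL body joints x 3 axis-angle components, i.e. the
+        # dimensionality of the pose modelled by the GMM prior.)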
+ sqrdets = np.array([(np.sqrt(np.linalg.det(c)))
+ for c in gmm['covars']])
+ const = (2 * np.pi)**(69 / 2.)
+
+ nll_weights = np.asarray(gmm['weights'] / (const *
+ (sqrdets / sqrdets.min())))
+ nll_weights = torch.tensor(nll_weights, dtype=dtype).unsqueeze(dim=0)
+ self.register_buffer('nll_weights', nll_weights)
+
+ weights = torch.tensor(gmm['weights'], dtype=dtype).unsqueeze(dim=0)
+ self.register_buffer('weights', weights)
+
+ self.register_buffer('pi_term',
+ torch.log(torch.tensor(2 * np.pi, dtype=dtype)))
+
+ cov_dets = [
+ np.log(np.linalg.det(cov.astype(np_dtype)) + epsilon)
+ for cov in covs
+ ]
+ self.register_buffer('cov_dets', torch.tensor(cov_dets, dtype=dtype))
+
+ # The dimensionality of the random variable
+ self.random_var_dim = self.means.shape[1]
+
+ def get_mean(self):
+ """Returns the mean of the mixture."""
+ mean_pose = torch.matmul(self.weights, self.means)
+ return mean_pose
+
+ def merged_log_likelihood(self, pose):
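+        # Max-mixture approximation: evaluate the weighted negative
+        # log-likelihood under every Gaussian component and keep the smallest
+        # value, i.e. the best-fitting component for each sample.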
+ diff_from_mean = pose.unsqueeze(dim=1) - self.means
+
+ prec_diff_prod = torch.einsum('mij,bmj->bmi',
+ [self.precisions, diff_from_mean])
+ diff_prec_quadratic = (prec_diff_prod * diff_from_mean).sum(dim=-1)
+
+ curr_loglikelihood = 0.5 * diff_prec_quadratic - \
+ torch.log(self.nll_weights)
+ # curr_loglikelihood = 0.5 * (self.cov_dets.unsqueeze(dim=0) +
+ # self.random_var_dim * self.pi_term +
+ # diff_prec_quadratic
+ # ) - torch.log(self.weights)
+
+ min_likelihood, _ = torch.min(curr_loglikelihood, dim=1)
+ return min_likelihood
+
+ def log_likelihood(self, pose):
+ """Create graph operation for negative log-likelihood calculation."""
+ likelihoods = []
+
+ for idx in range(self.num_gaussians):
+ mean = self.means[idx]
+ prec = self.precisions[idx]
+ cov = self.covs[idx]
+ diff_from_mean = pose - mean
+
+ curr_loglikelihood = torch.einsum('bj,ji->bi',
+ [diff_from_mean, prec])
+ curr_loglikelihood = torch.einsum(
+ 'bi,bi->b', [curr_loglikelihood, diff_from_mean])
+ cov_term = torch.log(torch.det(cov) + self.epsilon)
+ curr_loglikelihood += 0.5 * (cov_term +
+ self.random_var_dim * self.pi_term)
+ likelihoods.append(curr_loglikelihood)
+
+ log_likelihoods = torch.stack(likelihoods, dim=1)
+ min_idx = torch.argmin(log_likelihoods, dim=1)
+ weight_component = self.nll_weights[:, min_idx]
+ weight_component = -torch.log(weight_component)
+
+ return weight_component + log_likelihoods[:, min_idx]
+
+ def forward(self,
+ body_pose,
+ loss_weight_override=None,
+ reduction_override=None):
+
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ loss_weight = (loss_weight_override if loss_weight_override is not None
+ else self.loss_weight)
+
+ if self.use_merged:
+ pose_prior_loss = self.merged_log_likelihood(body_pose)
+ else:
+ pose_prior_loss = self.log_likelihood(body_pose)
+
+ pose_prior_loss = loss_weight * pose_prior_loss
+
+ if reduction == 'mean':
+ pose_prior_loss = pose_prior_loss.mean()
+ elif reduction == 'sum':
+ pose_prior_loss = pose_prior_loss.sum()
+
+ return pose_prior_loss
diff --git a/detrsmpl/models/losses/rotaion_distance_loss.py b/detrsmpl/models/losses/rotaion_distance_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..51118826fcd737d7d6728dd5b065c366727d8b84
--- /dev/null
+++ b/detrsmpl/models/losses/rotaion_distance_loss.py
@@ -0,0 +1,62 @@
+import torch
+import torch.nn as nn
+
+
+def rotation_distance_loss(pred, target, epsilon):
+    """Rotation distance loss: geodesic angle between rotation matrices."""
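+    # trace(R_pred^T @ R_target) equals the sum of the element-wise product of
+    # the two 3x3 matrices; the geodesic angle is acos((trace - 1) / 2),
+    # clamped away from +/-1 so that acos stays differentiable.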
+ tr = torch.einsum(
+ 'bij,bij->b',
+ [pred.view(-1, 3, 3), target.view(-1, 3, 3)])
+ theta = (tr - 1) * 0.5
+ loss = torch.acos(torch.clamp(theta, -1 + epsilon, 1 - epsilon))
+ return loss
+
+
+class RotationDistance(nn.Module):
+ """Rotation Distance Loss.
+
+ Args:
+ reduction (str, optional): The method that reduces the loss to a
+ scalar. Options are "none", "mean" and "sum".
+ epsilon (float, optional): A minimal value to avoid NaN.
+ loss_weight (float, optional): The weight of the loss. Defaults to 1.0
+ """
+ def __init__(self, reduction='mean', epsilon=1e-7, loss_weight=1.0):
+ super(RotationDistance, self).__init__()
+ assert reduction in (None, 'none', 'mean', 'sum')
+ reduction = 'none' if reduction is None else reduction
+ self.reduction = reduction
+ self.epsilon = epsilon
+ self.loss_weight = loss_weight
+
+ def forward(self,
+ pred,
+ target,
+ weight=None,
+ avg_factor=None,
+ reduction_override=None):
+ """Forward function of loss.
+
+ Args:
+ pred (torch.Tensor): The prediction.
+ target (torch.Tensor): The learning target of the prediction.
+ weight (torch.Tensor, optional): Weight of the loss for each
+ prediction. Defaults to None.
+ avg_factor (int, optional): Average factor that is used to average
+ the loss. Defaults to None.
+ reduction_override (str, optional): The reduction method used to
+ override the original reduction method of the loss.
+ Defaults to None.
+ Returns:
+ torch.Tensor: The calculated loss
+ """
+
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ loss = self.loss_weight * rotation_distance_loss(
+ pred, target, epsilon=self.epsilon)
+ if weight is not None:
+ loss = loss.view(pred.shape[0], -1) * weight.view(
+ pred.shape[0], -1)
+ return loss.sum() / (weight.gt(0).sum() + self.epsilon)
+ else:
+ return loss.sum() / pred.shape[0]
diff --git a/detrsmpl/models/losses/smooth_l1_loss.py b/detrsmpl/models/losses/smooth_l1_loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..63cf2709503ec72955356a7beca14f93a1a9b5eb
--- /dev/null
+++ b/detrsmpl/models/losses/smooth_l1_loss.py
@@ -0,0 +1,128 @@
+import torch
+import torch.nn as nn
+
+from .utils import weighted_loss
+
+
+@weighted_loss
+def smooth_l1_loss(pred, target, beta=1.0):
+ """Smooth L1 loss.
+
+ Args:
+ pred (torch.Tensor): The prediction.
+ target (torch.Tensor): The learning target of the prediction.
+ beta (float, optional): The threshold in the piecewise function.
+ Defaults to 1.0.
+ Returns:
+ torch.Tensor: Calculated loss
+ """
+ assert beta > 0
+ assert pred.size() == target.size() and target.numel() > 0
+ diff = torch.abs(pred - target)
+ loss = torch.where(diff < beta, 0.5 * diff * diff / beta,
+ diff - 0.5 * beta)
+ return loss
+
+
+@weighted_loss
+def l1_loss(pred, target):
+ """L1 loss.
+
+ Args:
+ pred (torch.Tensor): The prediction.
+ target (torch.Tensor): The learning target of the prediction.
+ Returns:
+ torch.Tensor: Calculated loss
+ """
+ assert pred.size() == target.size() and target.numel() > 0
+ loss = torch.abs(pred - target)
+ return loss
+
+
+class SmoothL1Loss(nn.Module):
+ """Smooth L1 loss.
+
+ Args:
+ beta (float, optional): The threshold in the piecewise function.
+ Defaults to 1.0.
+ reduction (str, optional): The method to reduce the loss.
+ Options are "none", "mean" and "sum". Defaults to "mean".
+ loss_weight (float, optional): The weight of loss.
+ """
+ def __init__(self, beta=1.0, reduction='mean', loss_weight=1.0):
+ super(SmoothL1Loss, self).__init__()
+ self.beta = beta
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+
+ def forward(self,
+ pred,
+ target,
+ weight=None,
+ avg_factor=None,
+ reduction_override=None,
+ **kwargs):
+ """Forward function.
+
+ Args:
+ pred (torch.Tensor): The prediction.
+ target (torch.Tensor): The learning target of the prediction.
+ weight (torch.Tensor, optional): The weight of loss for each
+ prediction. Defaults to None.
+ avg_factor (int, optional): Average factor that is used to average
+ the loss. Defaults to None.
+ reduction_override (str, optional): The reduction method used to
+ override the original reduction method of the loss.
+ Defaults to None.
+ """
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ loss = self.loss_weight * smooth_l1_loss(pred,
+ target,
+ weight,
+ beta=self.beta,
+ reduction=reduction,
+ avg_factor=avg_factor,
+ **kwargs)
+ return loss
+
+
+class L1Loss(nn.Module):
+ """L1 loss.
+
+ Args:
+ reduction (str, optional): The method to reduce the loss.
+ Options are "none", "mean" and "sum".
+ loss_weight (float, optional): The weight of loss.
+ """
+ def __init__(self, reduction='mean', loss_weight=1.0):
+ super(L1Loss, self).__init__()
+ self.reduction = reduction
+ self.loss_weight = loss_weight
+
+ def forward(self,
+ pred,
+ target,
+ weight=None,
+ avg_factor=None,
+ reduction_override=None):
+ """Forward function.
+
+ Args:
+ pred (torch.Tensor): The prediction.
+ target (torch.Tensor): The learning target of the prediction.
+ weight (torch.Tensor, optional): The weight of loss for each
+ prediction. Defaults to None.
+ avg_factor (int, optional): Average factor that is used to average
+ the loss. Defaults to None.
+ reduction_override (str, optional): The reduction method used to
+ override the original reduction method of the loss.
+ Defaults to None.
+ """
+ assert reduction_override in (None, 'none', 'mean', 'sum')
+ reduction = (reduction_override
+ if reduction_override else self.reduction)
+ loss = self.loss_weight * l1_loss(
+ pred, target, weight, reduction=reduction, avg_factor=avg_factor)
+ return loss
diff --git a/detrsmpl/models/losses/utils.py b/detrsmpl/models/losses/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..322a0f149ef0a975fad3b2ee41c94757e9af0e37
--- /dev/null
+++ b/detrsmpl/models/losses/utils.py
@@ -0,0 +1,119 @@
+import functools
+
+import torch
+import torch.nn.functional as F
+
+
+def reduce_loss(loss, reduction):
+ """Reduce loss as specified.
+
+ Args:
+ loss (Tensor): Elementwise loss tensor.
+ reduction (str): Options are "none", "mean" and "sum".
+
+ Return:
+ Tensor: Reduced loss tensor.
+ """
+ reduction_enum = F._Reduction.get_enum(reduction)
+ # none: 0, elementwise_mean:1, sum: 2
+ if reduction_enum == 0:
+ return loss
+ elif reduction_enum == 1:
+ return loss.mean()
+ elif reduction_enum == 2:
+ return loss.sum()
+
+
+def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None):
+ """Apply element-wise weight and reduce loss.
+
+ Args:
+ loss (Tensor): Element-wise loss.
+ weight (Tensor): Element-wise weights.
+ reduction (str): Same as built-in losses of PyTorch.
+ avg_factor (float): Average factor when computing the mean of losses.
+
+ Returns:
+ Tensor: Processed loss values.
+ """
+ # if weight is specified, apply element-wise weight
+ if weight is not None:
+ loss = loss * weight
+
+ # if avg_factor is not specified, just reduce the loss
+ if avg_factor is None:
+ loss = reduce_loss(loss, reduction)
+ else:
+ # if reduction is mean, then average the loss by avg_factor
+ if reduction == 'mean':
+ loss = loss.sum() / avg_factor
+ # if reduction is 'none', then do nothing, otherwise raise an error
+ elif reduction != 'none':
+ raise ValueError('avg_factor can not be used with reduction="sum"')
+ return loss
+
+
+def weighted_loss(loss_func):
+ """Create a weighted version of a given loss function.
+
+ To use this decorator, the loss function must have the signature like
+ `loss_func(pred, target, **kwargs)`. The function only needs to compute
+ element-wise loss without any reduction. This decorator will add weight
+ and reduction arguments to the function. The decorated function will have
+ the signature like `loss_func(pred, target, weight=None, reduction='mean',
+ avg_factor=None, **kwargs)`.
+
+ :Example:
+
+ >>> import torch
+ >>> @weighted_loss
+ >>> def l1_loss(pred, target):
+ >>> return (pred - target).abs()
+
+ >>> pred = torch.Tensor([0, 2, 3])
+ >>> target = torch.Tensor([1, 1, 1])
+ >>> weight = torch.Tensor([1, 0, 1])
+
+ >>> l1_loss(pred, target)
+ tensor(1.3333)
+ >>> l1_loss(pred, target, weight)
+ tensor(1.)
+ >>> l1_loss(pred, target, reduction='none')
+ tensor([1., 1., 2.])
+ >>> l1_loss(pred, target, weight, avg_factor=2)
+ tensor(1.5000)
+ """
+ @functools.wraps(loss_func)
+ def wrapper(pred,
+ target,
+ weight=None,
+ reduction='mean',
+ avg_factor=None,
+ **kwargs):
+ # get element-wise loss
+ loss = loss_func(pred, target, **kwargs)
+ loss = weight_reduce_loss(loss, weight, reduction, avg_factor)
+ return loss
+
+ return wrapper
+
+
+def convert_to_one_hot(targets: torch.Tensor, classes) -> torch.Tensor:
+ """This function converts target class indices to one-hot vectors, given
+ the number of classes.
+
+ Args:
+ targets (Tensor): The ground truth label of the prediction
+ with shape (N, 1)
+ classes (int): the number of classes.
+
+ Returns:
+        Tensor: One-hot encoded targets of shape (N, classes).
+ """
+ assert (torch.max(targets).item() <
+ classes), 'Class Index must be less than number of classes'
+ one_hot_targets = torch.zeros((targets.shape[0], classes),
+ dtype=torch.long,
+ device=targets.device)
+ one_hot_targets.scatter_(1, targets.long(), 1)
+ return one_hot_targets
diff --git a/detrsmpl/models/necks/__init__.py b/detrsmpl/models/necks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..395994c223756dacb7d5f78063f71d42f6741bf7
--- /dev/null
+++ b/detrsmpl/models/necks/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .channel_mapper import ChannelMapper
+
+__all__ = ['ChannelMapper']
diff --git a/detrsmpl/models/necks/builder.py b/detrsmpl/models/necks/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..65c9b0c1ccf0999dc11d38ba9086d5cd7a009bae
--- /dev/null
+++ b/detrsmpl/models/necks/builder.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmcv.utils import Registry
+
+from .temporal_encoder import TemporalGRUEncoder
+
+NECKS = Registry('necks')
+
+NECKS.register_module(name='TemporalGRUEncoder', module=TemporalGRUEncoder)
+
+
+def build_neck(cfg):
+ """Build neck."""
+ if cfg is None:
+ return None
+ return NECKS.build(cfg)
diff --git a/detrsmpl/models/necks/channel_mapper.py b/detrsmpl/models/necks/channel_mapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..eac9a72b30be99a689ea4477c6090258f7001074
--- /dev/null
+++ b/detrsmpl/models/necks/channel_mapper.py
@@ -0,0 +1,98 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.cnn import ConvModule
+from mmcv.runner import BaseModule
+
+from .builder import NECKS
+
+
+@NECKS.register_module()
+class ChannelMapper(BaseModule):
+ r"""Channel Mapper to reduce/increase channels of backbone features.
+
+    It maps each scale of backbone features to the same number of output
+    channels with a ConvModule.
+
+ Args:
+ in_channels (List[int]): Number of input channels per scale.
+ out_channels (int): Number of output channels (used at each scale).
+ kernel_size (int, optional): kernel_size for reducing channels (used
+ at each scale). Default: 3.
+ conv_cfg (dict, optional): Config dict for convolution layer.
+ Default: None.
+ norm_cfg (dict, optional): Config dict for normalization layer.
+ Default: None.
+ act_cfg (dict, optional): Config dict for activation layer in
+ ConvModule. Default: dict(type='ReLU').
+ num_outs (int, optional): Number of output feature maps. There
+ would be extra_convs when num_outs larger than the length
+ of in_channels.
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Example:
+ >>> import torch
+ >>> in_channels = [2, 3, 5, 7]
+ >>> scales = [340, 170, 84, 43]
+ >>> inputs = [torch.rand(1, c, s, s)
+ ... for c, s in zip(in_channels, scales)]
+ >>> self = ChannelMapper(in_channels, 11, 3).eval()
+ >>> outputs = self.forward(inputs)
+ >>> for i in range(len(outputs)):
+ ... print(f'outputs[{i}].shape = {outputs[i].shape}')
+ outputs[0].shape = torch.Size([1, 11, 340, 340])
+ outputs[1].shape = torch.Size([1, 11, 170, 170])
+ outputs[2].shape = torch.Size([1, 11, 84, 84])
+ outputs[3].shape = torch.Size([1, 11, 43, 43])
+ """
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size=3,
+ conv_cfg=None,
+ norm_cfg=None,
+ act_cfg=dict(type='ReLU'),
+ num_outs=None,
+ init_cfg=dict(type='Xavier',
+ layer='Conv2d',
+ distribution='uniform')):
+ super(ChannelMapper, self).__init__(init_cfg)
+ assert isinstance(in_channels, list)
+ self.extra_convs = None
+ if num_outs is None:
+ num_outs = len(in_channels)
+ self.convs = nn.ModuleList()
+ for in_channel in in_channels:
+ self.convs.append(
+ ConvModule(in_channel,
+ out_channels,
+ kernel_size,
+ padding=(kernel_size - 1) // 2,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg))
+ if num_outs > len(in_channels):
+ self.extra_convs = nn.ModuleList()
+ for i in range(len(in_channels), num_outs):
+ if i == len(in_channels):
+ in_channel = in_channels[-1]
+ else:
+ in_channel = out_channels
+ self.extra_convs.append(
+ ConvModule(in_channel,
+ out_channels,
+ 3,
+ stride=2,
+ padding=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=act_cfg))
+
+ def forward(self, inputs):
+ """Forward function."""
+ assert len(inputs) == len(self.convs)
+ outs = [self.convs[i](inputs[i]) for i in range(len(inputs))]
+ if self.extra_convs:
+ for i in range(len(self.extra_convs)):
+ if i == 0:
+ outs.append(self.extra_convs[0](inputs[-1]))
+ else:
+ outs.append(self.extra_convs[i](outs[-1]))
+ return tuple(outs)
diff --git a/detrsmpl/models/necks/temporal_encoder.py b/detrsmpl/models/necks/temporal_encoder.py
new file mode 100644
index 0000000000000000000000000000000000000000..ccb0da5ff7225dcb9263f22fbd0d8bc549e5bdcd
--- /dev/null
+++ b/detrsmpl/models/necks/temporal_encoder.py
@@ -0,0 +1,41 @@
+from typing import Optional, Union
+
+import torch.nn as nn
+from mmcv.runner.base_module import BaseModule
+
+
+class TemporalGRUEncoder(BaseModule):
+ """TemporalEncoder used for VIBE. Adapted from
+ https://github.com/mkocabas/VIBE.
+
+ Args:
+ input_size (int, optional): dimension of input feature. Default: 2048.
+        num_layers (int, optional): number of layers for GRU. Default: 1.
+ hidden_size (int, optional): hidden size for GRU. Default: 2048.
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Default: None.
+ """
+ def __init__(self,
+ input_size: Optional[int] = 2048,
+ num_layers: Optional[int] = 1,
+ hidden_size: Optional[int] = 2048,
+ init_cfg: Optional[Union[list, dict, None]] = None):
+ super(TemporalGRUEncoder, self).__init__(init_cfg)
+
+ self.input_size = input_size
+ self.hidden_size = hidden_size
+ self.gru = nn.GRU(input_size=input_size,
+ hidden_size=hidden_size,
+ bidirectional=False,
+ num_layers=num_layers)
+ self.relu = nn.ReLU()
+        self.linear = nn.Linear(hidden_size, input_size)
+
+ def forward(self, x):
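+        # x is expected to be (N, T, input_size); nn.GRU defaults to
+        # sequence-first (T, N, C) inputs, hence the permutes. The linear
+        # output is added back to the GRU input as a residual connection.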
+ N, T = x.shape[:2]
+ x = x.permute(1, 0, 2)
+ y, _ = self.gru(x)
+ y = self.linear(self.relu(y).view(-1, self.hidden_size))
+ y = y.view(T, N, self.input_size) + x
+ y = y.permute(1, 0, 2).contiguous()
+ return y
diff --git a/detrsmpl/models/registrants/__init__.py b/detrsmpl/models/registrants/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/models/registrants/builder.py b/detrsmpl/models/registrants/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9da8c4e72cf0ea797b6148d45281668b02ef3940
--- /dev/null
+++ b/detrsmpl/models/registrants/builder.py
@@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmcv.utils import Registry
+
+from .smplify import SMPLify
+from .smplifyx import SMPLifyX
+
+REGISTRANTS = Registry('registrants')
+
+REGISTRANTS.register_module(name='SMPLify', module=SMPLify)
+REGISTRANTS.register_module(name='SMPLifyX', module=SMPLifyX)
+
+
+def build_registrant(cfg):
+ """Build registrant."""
+ if cfg is None:
+ return None
+ return REGISTRANTS.build(cfg)
diff --git a/detrsmpl/models/registrants/smplify.py b/detrsmpl/models/registrants/smplify.py
new file mode 100644
index 0000000000000000000000000000000000000000..b37cb8094aac246d1082728b57a9411f9161a867
--- /dev/null
+++ b/detrsmpl/models/registrants/smplify.py
@@ -0,0 +1,829 @@
+from typing import List, Tuple, Union
+
+import numpy as np
+import torch
+from mmcv.runner import build_optimizer
+
+from detrsmpl.core.cameras import build_cameras
+from detrsmpl.core.conventions.keypoints_mapping import (
+ get_keypoint_idx,
+ get_keypoint_idxs_by_part,
+)
+from ..body_models.builder import build_body_model
+from ..losses.builder import build_loss
+
+
+class OptimizableParameters():
+ """Collects parameters for optimization."""
+
+ def __init__(self):
+ self.opt_params = []
+
+ def set_param(self, fit_param: torch.Tensor, param: torch.Tensor) -> None:
+ """Set requires_grad and collect parameters for optimization.
+
+ Args:
+ fit_param: whether to optimize this body model parameter
+ param: body model parameter
+
+ Returns:
+ None
+ """
+ if fit_param:
+ param.requires_grad = True
+ self.opt_params.append(param)
+ else:
+ param.requires_grad = False
+
+ def parameters(self) -> List[torch.Tensor]:
+ """Returns parameters. Compatible with mmcv's build_parameters()
+
+ Returns:
+ opt_params: a list of body model parameters for optimization
+ """
+ return self.opt_params
+
+
+class SMPLify(object):
+ """Re-implementation of SMPLify with extended features.
+
+ - video input
+ - 3D keypoints
+ """
+
+ def __init__(self,
+ body_model: Union[dict, torch.nn.Module],
+ num_epochs: int = 20,
+ camera: Union[dict, torch.nn.Module] = None,
+ img_res: Union[Tuple[int], int] = 224,
+ stages: dict = None,
+ optimizer: dict = None,
+ keypoints2d_loss: dict = None,
+ keypoints3d_loss: dict = None,
+ shape_prior_loss: dict = None,
+ joint_prior_loss: dict = None,
+ smooth_loss: dict = None,
+ pose_prior_loss: dict = None,
+ pose_reg_loss: dict = None,
+ limb_length_loss: dict = None,
+ use_one_betas_per_video: bool = False,
+ ignore_keypoints: List[int] = None,
+ device=torch.device(
+ 'cuda' if torch.cuda.is_available() else 'cpu'),
+ verbose: bool = False) -> None:
+ """
+ Args:
+ body_model: config or an object of body model.
+ num_epochs: number of epochs of registration
+ camera: config or an object of camera
+ img_res: image resolution. If tuple, values are (width, height)
+ stages: config of registration stages
+ optimizer: config of optimizer
+ keypoints2d_loss: config of keypoint 2D loss
+ keypoints3d_loss: config of keypoint 3D loss
+ shape_prior_loss: config of shape prior loss.
+ Used to prevent extreme shapes.
+ joint_prior_loss: config of joint prior loss.
+ Used to prevent large joint rotations.
+ smooth_loss: config of smooth loss.
+ Used to prevent jittering by temporal smoothing.
+ pose_prior_loss: config of pose prior loss.
+ Used to prevent unnatural pose.
+ pose_reg_loss: config of pose regularizer loss.
+ Used to prevent pose being too large.
+ limb_length_loss: config of limb length loss.
+ Used to prevent the change of body shape.
+ use_one_betas_per_video: whether to use the same beta parameters
+ for all frames in a single video sequence.
+ ignore_keypoints: list of keypoint names to ignore in keypoint
+ loss computation
+ device: torch device
+ verbose: whether to print information during registration
+
+ Returns:
+ None
+ """
+
+ self.use_one_betas_per_video = use_one_betas_per_video
+ self.num_epochs = num_epochs
+ self.img_res = img_res
+ self.device = device
+ self.stage_config = stages
+ self.optimizer = optimizer
+ self.keypoints2d_mse_loss = build_loss(keypoints2d_loss)
+ self.keypoints3d_mse_loss = build_loss(keypoints3d_loss)
+ self.shape_prior_loss = build_loss(shape_prior_loss)
+ self.joint_prior_loss = build_loss(joint_prior_loss)
+ self.smooth_loss = build_loss(smooth_loss)
+ self.pose_prior_loss = build_loss(pose_prior_loss)
+ self.pose_reg_loss = build_loss(pose_reg_loss)
+ self.limb_length_loss = build_loss(limb_length_loss)
+
+ if self.joint_prior_loss is not None:
+ self.joint_prior_loss = self.joint_prior_loss.to(self.device)
+ if self.smooth_loss is not None:
+ self.smooth_loss = self.smooth_loss.to(self.device)
+ if self.pose_prior_loss is not None:
+ self.pose_prior_loss = self.pose_prior_loss.to(self.device)
+ if self.pose_reg_loss is not None:
+ self.pose_reg_loss = self.pose_reg_loss.to(self.device)
+ if self.limb_length_loss is not None:
+ self.limb_length_loss = self.limb_length_loss.to(self.device)
+
+ # initialize body model
+ if isinstance(body_model, dict):
+ self.body_model = build_body_model(body_model).to(self.device)
+ elif isinstance(body_model, torch.nn.Module):
+ self.body_model = body_model.to(self.device)
+ else:
+ raise TypeError(f'body_model should be either dict or '
+ f'torch.nn.Module, but got {type(body_model)}')
+
+ # initialize camera
+ if camera is not None:
+ if isinstance(camera, dict):
+ self.camera = build_cameras(camera).to(self.device)
+ elif isinstance(camera, torch.nn.Module):
+ self.camera = camera.to(device)
+ else:
+ raise TypeError(f'camera should be either dict or '
+ f'torch.nn.Module, but got {type(camera)}')
+
+ self.ignore_keypoints = ignore_keypoints
+ self.verbose = verbose
+
+ self._set_keypoint_idxs()
+
+ def __call__(self,
+ keypoints2d: torch.Tensor = None,
+ keypoints2d_conf: torch.Tensor = None,
+ keypoints3d: torch.Tensor = None,
+ keypoints3d_conf: torch.Tensor = None,
+ init_global_orient: torch.Tensor = None,
+ init_transl: torch.Tensor = None,
+ init_body_pose: torch.Tensor = None,
+ init_betas: torch.Tensor = None,
+ return_verts: bool = False,
+ return_joints: bool = False,
+ return_full_pose: bool = False,
+ return_losses: bool = False) -> dict:
+ """Run registration.
+
+ Notes:
+ B: batch size
+ K: number of keypoints
+ D: shape dimension
+ Provide only keypoints2d or keypoints3d, not both.
+
+ Args:
+ keypoints2d: 2D keypoints of shape (B, K, 2)
+ keypoints2d_conf: 2D keypoint confidence of shape (B, K)
+ keypoints3d: 3D keypoints of shape (B, K, 3).
+ keypoints3d_conf: 3D keypoint confidence of shape (B, K)
+ init_global_orient: initial global_orient of shape (B, 3)
+ init_transl: initial transl of shape (B, 3)
+ init_body_pose: initial body_pose of shape (B, 69)
+ init_betas: initial betas of shape (B, D)
+ return_verts: whether to return vertices
+ return_joints: whether to return joints
+ return_full_pose: whether to return full pose
+ return_losses: whether to return loss dict
+
+ Returns:
+ ret: a dictionary that includes body model parameters,
+ and optional attributes such as vertices and joints
+ """
+ assert keypoints2d is not None or keypoints3d is not None, \
+            'Neither 2D nor 3D keypoints are provided.'
+ assert not (keypoints2d is not None and keypoints3d is not None), \
+ 'Do not provide both 2D and 3D keypoints.'
+ batch_size = keypoints2d.shape[0] if keypoints2d is not None \
+ else keypoints3d.shape[0]
+
+ global_orient = self._match_init_batch_size(
+ init_global_orient, self.body_model.global_orient, batch_size)
+ transl = self._match_init_batch_size(init_transl,
+ self.body_model.transl,
+ batch_size)
+ body_pose = self._match_init_batch_size(init_body_pose,
+ self.body_model.body_pose,
+ batch_size)
+ if init_betas is None and self.use_one_betas_per_video:
+ betas = torch.zeros(1, self.body_model.betas.shape[-1]).to(
+ self.device)
+ else:
+ betas = self._match_init_batch_size(init_betas,
+ self.body_model.betas,
+ batch_size)
+
+ for i in range(self.num_epochs):
+ for stage_idx, stage_config in enumerate(self.stage_config):
+ if self.verbose:
+ print(f'epoch {i}, stage {stage_idx}')
+ self._optimize_stage(
+ global_orient=global_orient,
+ transl=transl,
+ body_pose=body_pose,
+ betas=betas,
+ keypoints2d=keypoints2d,
+ keypoints2d_conf=keypoints2d_conf,
+ keypoints3d=keypoints3d,
+ keypoints3d_conf=keypoints3d_conf,
+ **stage_config,
+ )
+
+ # collate results
+ ret = {
+ 'global_orient': global_orient,
+ 'transl': transl,
+ 'body_pose': body_pose,
+ 'betas': betas
+ }
+
+ if return_verts or return_joints or \
+ return_full_pose or return_losses:
+ eval_ret = self.evaluate(
+ global_orient=global_orient,
+ body_pose=body_pose,
+ betas=betas,
+ transl=transl,
+ keypoints2d=keypoints2d,
+ keypoints2d_conf=keypoints2d_conf,
+ keypoints3d=keypoints3d,
+ keypoints3d_conf=keypoints3d_conf,
+ return_verts=return_verts,
+ return_full_pose=return_full_pose,
+ return_joints=return_joints,
+ reduction_override='none' # sample-wise loss
+ )
+
+ if return_verts:
+ ret['vertices'] = eval_ret['vertices']
+ if return_joints:
+ ret['joints'] = eval_ret['joints']
+ if return_full_pose:
+ ret['full_pose'] = eval_ret['full_pose']
+ if return_losses:
+ for k in eval_ret.keys():
+ if 'loss' in k:
+ ret[k] = eval_ret[k]
+
+ for k, v in ret.items():
+ if isinstance(v, torch.Tensor):
+ ret[k] = v.detach().clone()
+
+ return ret
+
+ def _optimize_stage(self,
+ betas: torch.Tensor,
+ body_pose: torch.Tensor,
+ global_orient: torch.Tensor,
+ transl: torch.Tensor,
+ fit_global_orient: bool = True,
+ fit_transl: bool = True,
+ fit_body_pose: bool = True,
+ fit_betas: bool = True,
+ keypoints2d: torch.Tensor = None,
+ keypoints2d_conf: torch.Tensor = None,
+ keypoints2d_weight: float = None,
+ keypoints3d: torch.Tensor = None,
+ keypoints3d_conf: torch.Tensor = None,
+ keypoints3d_weight: float = None,
+ shape_prior_weight: float = None,
+ joint_prior_weight: float = None,
+ smooth_loss_weight: float = None,
+ pose_prior_weight: float = None,
+ pose_reg_weight: float = None,
+ limb_length_weight: float = None,
+ joint_weights: dict = {},
+ num_iter: int = 1,
+ ftol: float = 1e-4,
+ **kwargs) -> None:
+ """Optimize a stage of body model parameters according to
+ configuration.
+
+ Notes:
+ B: batch size
+ K: number of keypoints
+ D: shape dimension
+
+ Args:
+ betas: shape (B, D)
+ body_pose: shape (B, 69)
+ global_orient: shape (B, 3)
+ transl: shape (B, 3)
+ fit_global_orient: whether to optimize global_orient
+ fit_transl: whether to optimize transl
+ fit_body_pose: whether to optimize body_pose
+ fit_betas: whether to optimize betas
+ keypoints2d: 2D keypoints of shape (B, K, 2)
+ keypoints2d_conf: 2D keypoint confidence of shape (B, K)
+ keypoints2d_weight: weight of 2D keypoint loss
+ keypoints3d: 3D keypoints of shape (B, K, 3).
+ keypoints3d_conf: 3D keypoint confidence of shape (B, K)
+ keypoints3d_weight: weight of 3D keypoint loss
+ shape_prior_weight: weight of shape prior loss
+ joint_prior_weight: weight of joint prior loss
+ smooth_loss_weight: weight of smooth loss
+ pose_prior_weight: weight of pose prior loss
+ pose_reg_weight: weight of pose regularization loss
+ limb_length_weight: weight of limb length loss
+ joint_weights: per joint weight of shape (K, )
+ num_iter: number of iterations
+ ftol: early stop tolerance for relative change in loss
+
+ Returns:
+ None
+ """
+
+ parameters = OptimizableParameters()
+ parameters.set_param(fit_global_orient, global_orient)
+ parameters.set_param(fit_transl, transl)
+ parameters.set_param(fit_body_pose, body_pose)
+ parameters.set_param(fit_betas, betas)
+
+ optimizer = build_optimizer(parameters, self.optimizer)
+
+ pre_loss = None
+ for iter_idx in range(num_iter):
+
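+            # Optimizers such as LBFGS may re-evaluate the objective several
+            # times per step, so gradient zeroing, the forward pass and
+            # backward() all live inside the closure passed to optimizer.step.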
+ def closure():
+ optimizer.zero_grad()
+ betas_video = self._expand_betas(body_pose.shape[0], betas)
+
+ loss_dict = self.evaluate(
+ global_orient=global_orient,
+ body_pose=body_pose,
+ betas=betas_video,
+ transl=transl,
+ keypoints2d=keypoints2d,
+ keypoints2d_conf=keypoints2d_conf,
+ keypoints2d_weight=keypoints2d_weight,
+ keypoints3d=keypoints3d,
+ keypoints3d_conf=keypoints3d_conf,
+ keypoints3d_weight=keypoints3d_weight,
+ joint_prior_weight=joint_prior_weight,
+ shape_prior_weight=shape_prior_weight,
+ smooth_loss_weight=smooth_loss_weight,
+ pose_prior_weight=pose_prior_weight,
+ pose_reg_weight=pose_reg_weight,
+ limb_length_weight=limb_length_weight,
+ joint_weights=joint_weights)
+
+ loss = loss_dict['total_loss']
+ loss.backward()
+ return loss
+
+ loss = optimizer.step(closure)
+ if iter_idx > 0 and pre_loss is not None and ftol > 0:
+ loss_rel_change = self._compute_relative_change(
+ pre_loss, loss.item())
+ if loss_rel_change < ftol:
+ if self.verbose:
+ print(f'[ftol={ftol}] Early stop at {iter_idx} iter!')
+ break
+ pre_loss = loss.item()
+
+ def evaluate(
+ self,
+ betas: torch.Tensor = None,
+ body_pose: torch.Tensor = None,
+ global_orient: torch.Tensor = None,
+ transl: torch.Tensor = None,
+ keypoints2d: torch.Tensor = None,
+ keypoints2d_conf: torch.Tensor = None,
+ keypoints2d_weight: float = None,
+ keypoints3d: torch.Tensor = None,
+ keypoints3d_conf: torch.Tensor = None,
+ keypoints3d_weight: float = None,
+ shape_prior_weight: float = None,
+ joint_prior_weight: float = None,
+ smooth_loss_weight: float = None,
+ pose_prior_weight: float = None,
+ pose_reg_weight: float = None,
+ limb_length_weight: float = None,
+ joint_weights: dict = {},
+ return_verts: bool = False,
+ return_full_pose: bool = False,
+ return_joints: bool = False,
+ reduction_override: str = None,
+ ) -> dict:
+ """Evaluate fitted parameters through loss computation. This function
+        serves two purposes: 1) internally, for loss backpropagation;
+        2) externally, for fitting quality evaluation.
+
+ Notes:
+ B: batch size
+ K: number of keypoints
+ D: shape dimension
+
+ Args:
+ betas: shape (B, D)
+ body_pose: shape (B, 69)
+ global_orient: shape (B, 3)
+ transl: shape (B, 3)
+ keypoints2d: 2D keypoints of shape (B, K, 2)
+ keypoints2d_conf: 2D keypoint confidence of shape (B, K)
+ keypoints2d_weight: weight of 2D keypoint loss
+ keypoints3d: 3D keypoints of shape (B, K, 3).
+ keypoints3d_conf: 3D keypoint confidence of shape (B, K)
+ keypoints3d_weight: weight of 3D keypoint loss
+ shape_prior_weight: weight of shape prior loss
+ joint_prior_weight: weight of joint prior loss
+ smooth_loss_weight: weight of smooth loss
+ pose_prior_weight: weight of pose prior loss
+ pose_reg_weight: weight of pose regularization loss
+ limb_length_weight: weight of limb length loss
+ joint_weights: per joint weight of shape (K, )
+ return_verts: whether to return vertices
+ return_joints: whether to return joints
+ return_full_pose: whether to return full pose
+ reduction_override: reduction method, e.g., 'none', 'sum', 'mean'
+
+ Returns:
+ ret: a dictionary that includes body model parameters,
+ and optional attributes such as vertices and joints
+ """
+
+ ret = {}
+
+ body_model_output = self.body_model(
+ global_orient=global_orient,
+ body_pose=body_pose,
+ betas=betas,
+ transl=transl,
+ return_verts=return_verts,
+ return_full_pose=return_full_pose)
+
+ model_joints = body_model_output['joints']
+ model_joint_mask = body_model_output['joint_mask']
+
+ loss_dict = self._compute_loss(
+ model_joints,
+ model_joint_mask,
+ keypoints2d=keypoints2d,
+ keypoints2d_conf=keypoints2d_conf,
+ keypoints2d_weight=keypoints2d_weight,
+ keypoints3d=keypoints3d,
+ keypoints3d_conf=keypoints3d_conf,
+ keypoints3d_weight=keypoints3d_weight,
+ joint_prior_weight=joint_prior_weight,
+ shape_prior_weight=shape_prior_weight,
+ smooth_loss_weight=smooth_loss_weight,
+ pose_prior_weight=pose_prior_weight,
+ pose_reg_weight=pose_reg_weight,
+ limb_length_weight=limb_length_weight,
+ joint_weights=joint_weights,
+ reduction_override=reduction_override,
+ global_orient=global_orient,
+ body_pose=body_pose,
+ betas=betas)
+ ret.update(loss_dict)
+
+ if return_verts:
+ ret['vertices'] = body_model_output['vertices']
+ if return_full_pose:
+ ret['full_pose'] = body_model_output['full_pose']
+ if return_joints:
+ ret['joints'] = model_joints
+
+ return ret
+
+ def _compute_loss(self,
+ model_joints: torch.Tensor,
+ model_joint_conf: torch.Tensor,
+ keypoints2d: torch.Tensor = None,
+ keypoints2d_conf: torch.Tensor = None,
+ keypoints2d_weight: float = None,
+ keypoints3d: torch.Tensor = None,
+ keypoints3d_conf: torch.Tensor = None,
+ keypoints3d_weight: float = None,
+ shape_prior_weight: float = None,
+ joint_prior_weight: float = None,
+ smooth_loss_weight: float = None,
+ pose_prior_weight: float = None,
+ pose_reg_weight: float = None,
+ limb_length_weight: float = None,
+ joint_weights: dict = {},
+ reduction_override: str = None,
+ global_orient: torch.Tensor = None,
+ body_pose: torch.Tensor = None,
+ betas: torch.Tensor = None):
+ """Loss computation.
+
+ Notes:
+ B: batch size
+ K: number of keypoints
+ D: shape dimension
+
+ Args:
+            model_joints: 3D joints regressed from the body model, of shape
+                (B, K, 3)
+ model_joint_conf: 3D joint confidence of shape (B, K). It is
+ normally all 1, except for zero-pads due to convert_kps in
+ the SMPL wrapper.
+ keypoints2d: 2D keypoints of shape (B, K, 2)
+ keypoints2d_conf: 2D keypoint confidence of shape (B, K)
+ keypoints2d_weight: weight of 2D keypoint loss
+ keypoints3d: 3D keypoints of shape (B, K, 3).
+ keypoints3d_conf: 3D keypoint confidence of shape (B, K)
+ keypoints3d_weight: weight of 3D keypoint loss
+ shape_prior_weight: weight of shape prior loss
+ joint_prior_weight: weight of joint prior loss
+ smooth_loss_weight: weight of smooth loss
+ pose_prior_weight: weight of pose prior loss
+ joint_weights: per joint weight of shape (K, )
+ reduction_override: reduction method, e.g., 'none', 'sum', 'mean'
+ body_pose: shape (B, 69), for loss computation
+ betas: shape (B, D), for loss computation
+
+ Returns:
+ losses: a dict that contains all losses
+ """
+ losses = {}
+
+ weight = self._get_weight(**joint_weights)
+
+ # 2D keypoint loss
+ if keypoints2d is not None and not self._skip_loss(
+ self.keypoints2d_mse_loss, keypoints2d_weight):
+ # bs = model_joints.shape[0]
+ # projected_joints = perspective_projection(
+ # model_joints,
+ # torch.eye(3).expand((bs, 3, 3)).to(model_joints.device),
+ # torch.zeros((bs, 3)).to(model_joints.device), 5000.0,
+ # torch.Tensor([self.img_res / 2,
+ # self.img_res / 2]).to(model_joints.device))
+ projected_joints_xyd = self.camera.transform_points_screen(
+ model_joints)
+ projected_joints = projected_joints_xyd[..., :2]
+
+ # normalize keypoints to [-1,1]
+ projected_joints = 2 * projected_joints / (self.img_res - 1) - 1
+ keypoints2d = 2 * keypoints2d / (self.img_res - 1) - 1
+
+ keypoint2d_loss = self.keypoints2d_mse_loss(
+ pred=projected_joints,
+ pred_conf=model_joint_conf,
+ target=keypoints2d,
+ target_conf=keypoints2d_conf,
+ keypoint_weight=weight,
+ loss_weight_override=keypoints2d_weight,
+ reduction_override=reduction_override)
+ losses['keypoint2d_loss'] = keypoint2d_loss
+
+ # 3D keypoint loss
+ if keypoints3d is not None and not self._skip_loss(
+ self.keypoints3d_mse_loss, keypoints3d_weight):
+ keypoints3d_loss = self.keypoints3d_mse_loss(
+ pred=model_joints,
+ pred_conf=model_joint_conf,
+ target=keypoints3d,
+ target_conf=keypoints3d_conf,
+ keypoint_weight=weight,
+ loss_weight_override=keypoints3d_weight,
+ reduction_override=reduction_override)
+ losses['keypoints3d_loss'] = keypoints3d_loss
+
+ # regularizer to prevent betas from taking large values
+ if not self._skip_loss(self.shape_prior_loss, shape_prior_weight):
+ shape_prior_loss = self.shape_prior_loss(
+ betas=betas,
+ loss_weight_override=shape_prior_weight,
+ reduction_override=reduction_override)
+ losses['shape_prior_loss'] = shape_prior_loss
+
+ # joint prior loss
+ if not self._skip_loss(self.joint_prior_loss, joint_prior_weight):
+ joint_prior_loss = self.joint_prior_loss(
+ body_pose=body_pose,
+ loss_weight_override=joint_prior_weight,
+ reduction_override=reduction_override)
+ losses['joint_prior_loss'] = joint_prior_loss
+
+ # smooth body loss
+ if not self._skip_loss(self.smooth_loss, smooth_loss_weight):
+ smooth_loss = self.smooth_loss(
+ body_pose=body_pose,
+ loss_weight_override=smooth_loss_weight,
+ reduction_override=reduction_override)
+ losses['smooth_loss'] = smooth_loss
+
+ # pose prior loss
+ if not self._skip_loss(self.pose_prior_loss, pose_prior_weight):
+ pose_prior_loss = self.pose_prior_loss(
+ body_pose=body_pose,
+ loss_weight_override=pose_prior_weight,
+ reduction_override=reduction_override)
+ losses['pose_prior_loss'] = pose_prior_loss
+
+ # pose reg loss
+ if not self._skip_loss(self.pose_reg_loss, pose_reg_weight):
+ pose_reg_loss = self.pose_reg_loss(
+ body_pose=body_pose,
+ loss_weight_override=pose_reg_weight,
+ reduction_override=reduction_override)
+ losses['pose_reg_loss'] = pose_reg_loss
+
+ # limb length loss
+ if not self._skip_loss(self.limb_length_loss, limb_length_weight):
+ limb_length_loss = self.limb_length_loss(
+ pred=model_joints,
+ pred_conf=model_joint_conf,
+ target=keypoints3d,
+ target_conf=keypoints3d_conf,
+ loss_weight_override=limb_length_weight,
+ reduction_override=reduction_override)
+ losses['limb_length_loss'] = limb_length_loss
+
+        if self.verbose:
+            msg = ''
+            for loss_name, loss in losses.items():
+                msg += f'{loss_name}={loss.mean().item():.6f}, '
+            print(msg.strip(', '))
+
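+        # Collapse any remaining keypoint/coordinate dimensions of each loss
+        # term so that total_loss keeps at most the batch dimension.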
+ total_loss = 0
+ for loss_name, loss in losses.items():
+ if loss.ndim == 3:
+ total_loss = total_loss + loss.sum(dim=(2, 1))
+ elif loss.ndim == 2:
+ total_loss = total_loss + loss.sum(dim=-1)
+ else:
+ total_loss = total_loss + loss
+ losses['total_loss'] = total_loss
+
+ return losses
+
+ def _match_init_batch_size(self, init_param: torch.Tensor,
+ init_param_body_model: torch.Tensor,
+ batch_size: int) -> torch.Tensor:
+ """A helper function to ensure body model parameters have the same
+ batch size as the input keypoints.
+
+ Args:
+ init_param: input initial body model parameters, may be None
+ init_param_body_model: initial body model parameters from the
+ body model
+ batch_size: batch size of keypoints
+
+ Returns:
+ param: body model parameters with batch size aligned
+ """
+
+ # param takes init values
+ param = init_param.detach().clone() \
+ if init_param is not None \
+ else init_param_body_model.detach().clone()
+
+ # expand batch dimension to match batch size
+ param_batch_size = param.shape[0]
+ if param_batch_size != batch_size:
+ if param_batch_size == 1:
+ param = param.repeat(batch_size, *[1] * (param.ndim - 1))
+ else:
+ raise ValueError('Init param does not match the batch size of '
+ 'keypoints, and is not 1.')
+
+ # shape check
+ assert param.shape[0] == batch_size
+ assert param.shape[1:] == init_param_body_model.shape[1:], \
+ f'Shape mismatch: {param.shape} vs {init_param_body_model.shape}'
+
+ return param
+
+ def _set_keypoint_idxs(self) -> None:
+        """Set keypoint indices for 1) body parts that are assigned different
+        weights and 2) keypoints to be ignored in keypoint loss computation.
+
+ Returns:
+ None
+ """
+ convention = self.body_model.keypoint_dst
+
+ # obtain ignore keypoint indices
+ if self.ignore_keypoints is not None:
+ self.ignore_keypoint_idxs = []
+ for keypoint_name in self.ignore_keypoints:
+ keypoint_idx = get_keypoint_idx(
+ keypoint_name, convention=convention)
+ if keypoint_idx != -1:
+ self.ignore_keypoint_idxs.append(keypoint_idx)
+
+ # obtain body part keypoint indices
+ shoulder_keypoint_idxs = get_keypoint_idxs_by_part(
+ 'shoulder', convention=convention)
+ hip_keypoint_idxs = get_keypoint_idxs_by_part(
+ 'hip', convention=convention)
+ self.shoulder_hip_keypoint_idxs = [
+ *shoulder_keypoint_idxs, *hip_keypoint_idxs
+ ]
+
+ def _get_weight(self,
+ use_shoulder_hip_only: bool = False,
+ body_weight: float = 1.0) -> torch.Tensor:
+ """Get per keypoint weight.
+
+ Notes:
+ K: number of keypoints
+
+ Args:
+ use_shoulder_hip_only: whether to use only shoulder and hip
+ keypoints for loss computation. This is useful in the
+ warming-up stage to find a reasonably good initialization.
+ body_weight: weight of body keypoints. Body part segmentation
+ definition is included in the HumanData convention.
+
+ Returns:
+ weight: per keypoint weight tensor of shape (K)
+ """
+
+ num_keypoint = self.body_model.num_joints
+
+ if use_shoulder_hip_only:
+ weight = torch.zeros([num_keypoint]).to(self.device)
+ weight[self.shoulder_hip_keypoint_idxs] = 1.0
+ weight = weight * body_weight
+ else:
+ weight = torch.ones([num_keypoint]).to(self.device)
+ weight = weight * body_weight
+
+ if hasattr(self, 'ignore_keypoint_idxs'):
+ weight[self.ignore_keypoint_idxs] = 0.0
+
+ return weight
+
+ def _expand_betas(self, batch_size, betas):
+        """A helper function that expands the first dim of betas to match the
+        batch size, so that the same shape parameters can be shared by all
+        frames in a video sequence.
+
+ Notes:
+ B: batch size
+ K: number of keypoints
+ D: shape dimension
+
+ Args:
+ batch_size: batch size
+ betas: shape (B, D)
+
+ Returns:
+ betas_video: expanded betas
+ """
+ # no expansion needed
+ if batch_size == betas.shape[0]:
+ return betas
+
+ # first dim is 1
+ else:
+ feat_dim = betas.shape[-1]
+ betas_video = betas.view(1, feat_dim).expand(batch_size, feat_dim)
+
+ return betas_video
+
+ @staticmethod
+ def _compute_relative_change(pre_v, cur_v):
+        """Compute the relative change of the loss. If the relative change is
+        small enough, early stop can be applied to accelerate the
+        optimization. (1) When either value is larger than 1, the relative
+        change is the absolute difference divided by the larger absolute
+        value. (2) When both values are smaller than 1, it degrades to the
+        absolute change, since dividing the difference of two small, close
+        values by their max could otherwise yield a misleadingly large value.
+
+ Args:
+ pre_v: previous value
+ cur_v: current value
+
+ Returns:
+ float: relative change
+ """
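+        # Illustrative values (not from the original code): with pre_v=100.0
+        # and cur_v=99.0 the relative change is 1.0 / 100.0 = 0.01; with
+        # pre_v=0.5 and cur_v=0.4 the denominator clamps to 1, so the result
+        # is the absolute change 0.1.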
+ return np.abs(pre_v - cur_v) / max([np.abs(pre_v), np.abs(cur_v), 1])
+
+ @staticmethod
+ def _skip_loss(loss, loss_weight_override):
+        """Whether to skip loss computation. If loss is None, the computation
+        is skipped directly to avoid a RuntimeError. If loss is not None, the
+        table below shows the return value. A return value of True means the
+        loss can be skipped: its weighted contribution would be 0 even if it
+        were computed, so skipping it saves computation.
+
+ | loss.loss_weight | loss_weight_override | returns |
+ | ---------------- | -------------------- | ------- |
+ | == 0 | None | True |
+ | != 0 | None | False |
+ | == 0 | == 0 | True |
+ | != 0 | == 0 | True |
+ | == 0 | != 0 | False |
+ | != 0 | != 0 | False |
+
+ Args:
+ loss: loss is an object that has attribute loss_weight.
+ loss.loss_weight is assigned when loss is initialized.
+ loss_weight_override: loss_weight used to override loss.loss_weight
+
+ Returns:
+ bool: True means skipping loss computation, and vice versa
+ """
+ if (loss is None) or (loss.loss_weight == 0 and loss_weight_override is
+ None) or (loss_weight_override == 0):
+ return True
+ return False
diff --git a/detrsmpl/models/registrants/smplifyx.py b/detrsmpl/models/registrants/smplifyx.py
new file mode 100644
index 0000000000000000000000000000000000000000..7402440ddb4287a67b560c408d8d63e450301729
--- /dev/null
+++ b/detrsmpl/models/registrants/smplifyx.py
@@ -0,0 +1,489 @@
+import torch
+from mmcv.runner import build_optimizer
+
+from detrsmpl.core.conventions.keypoints_mapping import (
+ get_keypoint_idx,
+ get_keypoint_idxs_by_part,
+)
+from .smplify import OptimizableParameters, SMPLify
+
+
+class SMPLifyX(SMPLify):
+ """Re-implementation of SMPLify-X with extended features.
+
+ - video input
+ - 3D keypoints
+ """
+ def __call__(self,
+ keypoints2d: torch.Tensor = None,
+ keypoints2d_conf: torch.Tensor = None,
+ keypoints3d: torch.Tensor = None,
+ keypoints3d_conf: torch.Tensor = None,
+ init_global_orient: torch.Tensor = None,
+ init_transl: torch.Tensor = None,
+ init_body_pose: torch.Tensor = None,
+ init_betas: torch.Tensor = None,
+ init_left_hand_pose: torch.Tensor = None,
+ init_right_hand_pose: torch.Tensor = None,
+ init_expression: torch.Tensor = None,
+ init_jaw_pose: torch.Tensor = None,
+ init_leye_pose: torch.Tensor = None,
+ init_reye_pose: torch.Tensor = None,
+ return_verts: bool = False,
+ return_joints: bool = False,
+ return_full_pose: bool = False,
+ return_losses: bool = False) -> dict:
+ """Run registration.
+
+ Notes:
+ B: batch size
+ K: number of keypoints
+ D: body shape dimension
+ D_H: hand pose dimension
+ D_E: expression dimension
+ Provide only keypoints2d or keypoints3d, not both.
+
+ Args:
+ keypoints2d: 2D keypoints of shape (B, K, 2)
+ keypoints2d_conf: 2D keypoint confidence of shape (B, K)
+ keypoints3d: 3D keypoints of shape (B, K, 3).
+ keypoints3d_conf: 3D keypoint confidence of shape (B, K)
+ init_global_orient: initial global_orient of shape (B, 3)
+ init_transl: initial transl of shape (B, 3)
+ init_body_pose: initial body_pose of shape (B, 69)
+ init_betas: initial betas of shape (B, D)
+ init_left_hand_pose: initial left hand pose of shape (B, D_H)
+ init_right_hand_pose: initial right hand pose of shape (B, D_H)
+            init_expression: initial expression of shape (B, D_E)
+ init_jaw_pose: initial jaw pose of shape (B, 3)
+ init_leye_pose: initial left eye pose of shape (B, 3)
+ init_reye_pose: initial right eye pose of shape (B, 3)
+ return_verts: whether to return vertices
+ return_joints: whether to return joints
+ return_full_pose: whether to return full pose
+ return_losses: whether to return loss dict
+
+ Returns:
+ ret: a dictionary that includes body model parameters,
+ and optional attributes such as vertices and joints
+ """
+
+ assert keypoints2d is not None or keypoints3d is not None, \
+            'Neither 2D nor 3D keypoints are provided.'
+ assert not (keypoints2d is not None and keypoints3d is not None), \
+ 'Do not provide both 2D and 3D keypoints.'
+ batch_size = keypoints2d.shape[0] if keypoints2d is not None \
+ else keypoints3d.shape[0]
+
+ global_orient = self._match_init_batch_size(
+ init_global_orient, self.body_model.global_orient, batch_size)
+ transl = self._match_init_batch_size(init_transl,
+ self.body_model.transl,
+ batch_size)
+ body_pose = self._match_init_batch_size(init_body_pose,
+ self.body_model.body_pose,
+ batch_size)
+ left_hand_pose = self._match_init_batch_size(
+ init_left_hand_pose, self.body_model.left_hand_pose, batch_size)
+ right_hand_pose = self._match_init_batch_size(
+ init_right_hand_pose, self.body_model.right_hand_pose, batch_size)
+ expression = self._match_init_batch_size(init_expression,
+ self.body_model.expression,
+ batch_size)
+ jaw_pose = self._match_init_batch_size(init_jaw_pose,
+ self.body_model.jaw_pose,
+ batch_size)
+ leye_pose = self._match_init_batch_size(init_leye_pose,
+ self.body_model.leye_pose,
+ batch_size)
+ reye_pose = self._match_init_batch_size(init_reye_pose,
+ self.body_model.reye_pose,
+ batch_size)
+ if init_betas is None and self.use_one_betas_per_video:
+ betas = torch.zeros(1, self.body_model.betas.shape[-1]).to(
+ self.device)
+ else:
+ betas = self._match_init_batch_size(init_betas,
+ self.body_model.betas,
+ batch_size)
+
+ for i in range(self.num_epochs):
+ for stage_idx, stage_config in enumerate(self.stage_config):
+ # print(stage_name)
+ self._optimize_stage(
+ global_orient=global_orient,
+ transl=transl,
+ body_pose=body_pose,
+ betas=betas,
+ left_hand_pose=left_hand_pose,
+ right_hand_pose=right_hand_pose,
+ expression=expression,
+ jaw_pose=jaw_pose,
+ leye_pose=leye_pose,
+ reye_pose=reye_pose,
+ keypoints2d=keypoints2d,
+ keypoints2d_conf=keypoints2d_conf,
+ keypoints3d=keypoints3d,
+ keypoints3d_conf=keypoints3d_conf,
+ **stage_config,
+ )
+
+ return {
+ 'global_orient': global_orient,
+ 'transl': transl,
+ 'body_pose': body_pose,
+ 'betas': betas,
+ 'left_hand_pose': left_hand_pose,
+ 'right_hand_pose': right_hand_pose,
+ 'expression': expression,
+ 'jaw_pose': jaw_pose,
+ 'leye_pose': leye_pose,
+ 'reye_pose': reye_pose
+ }
+
+ def _optimize_stage(self,
+ betas: torch.Tensor,
+ body_pose: torch.Tensor,
+ global_orient: torch.Tensor,
+ transl: torch.Tensor,
+ left_hand_pose: torch.Tensor,
+ right_hand_pose: torch.Tensor,
+ expression: torch.Tensor,
+ jaw_pose: torch.Tensor,
+ leye_pose: torch.Tensor,
+ reye_pose: torch.Tensor,
+ fit_global_orient: bool = True,
+ fit_transl: bool = True,
+ fit_body_pose: bool = True,
+ fit_betas: bool = True,
+ fit_left_hand_pose: bool = True,
+ fit_right_hand_pose: bool = True,
+ fit_expression: bool = True,
+ fit_jaw_pose: bool = True,
+ fit_leye_pose: bool = True,
+ fit_reye_pose: bool = True,
+ keypoints2d: torch.Tensor = None,
+ keypoints2d_conf: torch.Tensor = None,
+ keypoints2d_weight: float = None,
+ keypoints3d: torch.Tensor = None,
+ keypoints3d_conf: torch.Tensor = None,
+ keypoints3d_weight: float = None,
+ shape_prior_weight: float = None,
+ joint_prior_weight: float = None,
+ smooth_loss_weight: float = None,
+ pose_prior_weight: float = None,
+ pose_reg_weight: float = None,
+ limb_length_weight: float = None,
+ joint_weights: dict = {},
+ ftol: float = 1e-4,
+ num_iter: int = 1) -> None:
+ """Optimize a stage of body model parameters according to
+ configuration.
+
+ Notes:
+ B: batch size
+ K: number of keypoints
+ D: shape dimension
+
+ Args:
+ betas: shape (B, D)
+ body_pose: shape (B, 69)
+ global_orient: shape (B, 3)
+ transl: shape (B, 3)
+ fit_global_orient: whether to optimize global_orient
+ fit_transl: whether to optimize transl
+ fit_body_pose: whether to optimize body_pose
+ fit_betas: whether to optimize betas
+ fit_left_hand_pose: whether to optimize left hand pose
+ fit_right_hand_pose: whether to optimize right hand pose
+ fit_expression: whether to optimize expression
+ fit_jaw_pose: whether to optimize jaw pose
+ fit_leye_pose: whether to optimize left eye pose
+ fit_reye_pose: whether to optimize right eye pose
+ keypoints2d: 2D keypoints of shape (B, K, 2)
+ keypoints2d_conf: 2D keypoint confidence of shape (B, K)
+ keypoints2d_weight: weight of 2D keypoint loss
+ keypoints3d: 3D keypoints of shape (B, K, 3).
+ keypoints3d_conf: 3D keypoint confidence of shape (B, K)
+ keypoints3d_weight: weight of 3D keypoint loss
+ shape_prior_weight: weight of shape prior loss
+ joint_prior_weight: weight of joint prior loss
+ smooth_loss_weight: weight of smooth loss
+ pose_prior_weight: weight of pose prior loss
+ pose_reg_weight: weight of pose regularization loss
+ limb_length_weight: weight of limb length loss
+ joint_weights: per joint weight of shape (K, )
+ num_iter: number of iterations
+ ftol: early stop tolerance for relative change in loss
+
+ Returns:
+ None
+ """
+
+ parameters = OptimizableParameters()
+ parameters.set_param(fit_global_orient, global_orient)
+ parameters.set_param(fit_transl, transl)
+ parameters.set_param(fit_body_pose, body_pose)
+ parameters.set_param(fit_betas, betas)
+ parameters.set_param(fit_left_hand_pose, left_hand_pose)
+ parameters.set_param(fit_right_hand_pose, right_hand_pose)
+ parameters.set_param(fit_expression, expression)
+ parameters.set_param(fit_jaw_pose, jaw_pose)
+ parameters.set_param(fit_leye_pose, leye_pose)
+ parameters.set_param(fit_reye_pose, reye_pose)
+
+ optimizer = build_optimizer(parameters, self.optimizer)
+
+ pre_loss = None
+ for iter_idx in range(num_iter):
+
+ def closure():
+ # body_pose_fixed = use_reference_spine(body_pose,
+ # init_body_pose)
+
+ optimizer.zero_grad()
+ betas_video = self._expand_betas(body_pose.shape[0], betas)
+
+ loss_dict = self.evaluate(
+ global_orient=global_orient,
+ body_pose=body_pose,
+ betas=betas_video,
+ transl=transl,
+ left_hand_pose=left_hand_pose,
+ right_hand_pose=right_hand_pose,
+ expression=expression,
+ jaw_pose=jaw_pose,
+ leye_pose=leye_pose,
+ reye_pose=reye_pose,
+ keypoints2d=keypoints2d,
+ keypoints2d_conf=keypoints2d_conf,
+ keypoints2d_weight=keypoints2d_weight,
+ keypoints3d=keypoints3d,
+ keypoints3d_conf=keypoints3d_conf,
+ keypoints3d_weight=keypoints3d_weight,
+ joint_prior_weight=joint_prior_weight,
+ shape_prior_weight=shape_prior_weight,
+ smooth_loss_weight=smooth_loss_weight,
+ pose_prior_weight=pose_prior_weight,
+ pose_reg_weight=pose_reg_weight,
+ limb_length_weight=limb_length_weight,
+ joint_weights=joint_weights)
+
+ loss = loss_dict['total_loss']
+ loss.backward()
+ return loss
+
+ loss = optimizer.step(closure)
+ if iter_idx > 0 and pre_loss is not None and ftol > 0:
+ loss_rel_change = self._compute_relative_change(
+ pre_loss, loss.item())
+ if loss_rel_change < ftol:
+                    if self.verbose:
+                        print(f'[ftol={ftol}] Early stop at {iter_idx} iter!')
+ break
+ pre_loss = loss.item()
+
+ def evaluate(
+ self,
+ betas: torch.Tensor = None,
+ body_pose: torch.Tensor = None,
+ global_orient: torch.Tensor = None,
+ transl: torch.Tensor = None,
+ left_hand_pose: torch.Tensor = None,
+ right_hand_pose: torch.Tensor = None,
+ expression: torch.Tensor = None,
+ jaw_pose: torch.Tensor = None,
+ leye_pose: torch.Tensor = None,
+ reye_pose: torch.Tensor = None,
+ keypoints2d: torch.Tensor = None,
+ keypoints2d_conf: torch.Tensor = None,
+ keypoints2d_weight: float = None,
+ keypoints3d: torch.Tensor = None,
+ keypoints3d_conf: torch.Tensor = None,
+ keypoints3d_weight: float = None,
+ shape_prior_weight: float = None,
+ joint_prior_weight: float = None,
+ smooth_loss_weight: float = None,
+ pose_prior_weight: float = None,
+ pose_reg_weight: float = None,
+ limb_length_weight: float = None,
+ joint_weights: dict = {},
+ return_verts: bool = False,
+ return_full_pose: bool = False,
+ return_joints: bool = False,
+ reduction_override: str = None,
+ ):
+ """Evaluate fitted parameters through loss computation. This function
+        serves two purposes: 1) internally, for loss backpropagation, and 2)
+        externally, for fitting quality evaluation.
+
+ Notes:
+ B: batch size
+ K: number of keypoints
+ D: body shape dimension
+ D_H: hand pose dimension
+ D_E: expression dimension
+
+ Args:
+ betas: shape (B, D)
+ body_pose: shape (B, 69)
+ global_orient: shape (B, 3)
+ transl: shape (B, 3)
+ left_hand_pose: shape (B, D_H)
+ right_hand_pose: shape (B, D_H)
+ expression: shape (B, D_E)
+ jaw_pose: shape (B, 3)
+ leye_pose: shape (B, 3)
+ reye_pose: shape (B, 3)
+ keypoints2d: 2D keypoints of shape (B, K, 2)
+ keypoints2d_conf: 2D keypoint confidence of shape (B, K)
+ keypoints2d_weight: weight of 2D keypoint loss
+ keypoints3d: 3D keypoints of shape (B, K, 3).
+ keypoints3d_conf: 3D keypoint confidence of shape (B, K)
+ keypoints3d_weight: weight of 3D keypoint loss
+ shape_prior_weight: weight of shape prior loss
+ joint_prior_weight: weight of joint prior loss
+ smooth_loss_weight: weight of smooth loss
+ pose_prior_weight: weight of pose prior loss
+ pose_reg_weight: weight of pose regularization loss
+ limb_length_weight: weight of limb length loss
+ joint_weights: per joint weight of shape (K, )
+ return_verts: whether to return vertices
+ return_joints: whether to return joints
+ return_full_pose: whether to return full pose
+ reduction_override: reduction method, e.g., 'none', 'sum', 'mean'
+
+ Returns:
+ ret: a dictionary that includes body model parameters,
+ and optional attributes such as vertices and joints
+ """
+
+ ret = {}
+
+ body_model_output = self.body_model(global_orient=global_orient,
+ body_pose=body_pose,
+ betas=betas,
+ transl=transl,
+ left_hand_pose=left_hand_pose,
+ right_hand_pose=right_hand_pose,
+ expression=expression,
+ jaw_pose=jaw_pose,
+ leye_pose=leye_pose,
+ reye_pose=reye_pose,
+ return_verts=return_verts,
+ return_full_pose=return_full_pose)
+
+ model_joints = body_model_output['joints']
+ model_joint_mask = body_model_output['joint_mask']
+
+ loss_dict = self._compute_loss(model_joints,
+ model_joint_mask,
+ keypoints2d=keypoints2d,
+ keypoints2d_conf=keypoints2d_conf,
+ keypoints2d_weight=keypoints2d_weight,
+ keypoints3d=keypoints3d,
+ keypoints3d_conf=keypoints3d_conf,
+ keypoints3d_weight=keypoints3d_weight,
+ joint_prior_weight=joint_prior_weight,
+ shape_prior_weight=shape_prior_weight,
+ smooth_loss_weight=smooth_loss_weight,
+ pose_prior_weight=pose_prior_weight,
+ pose_reg_weight=pose_reg_weight,
+ limb_length_weight=limb_length_weight,
+ joint_weights=joint_weights,
+ reduction_override=reduction_override,
+ body_pose=body_pose,
+ betas=betas)
+ ret.update(loss_dict)
+
+ if return_verts:
+ ret['vertices'] = body_model_output['vertices']
+ if return_full_pose:
+ ret['full_pose'] = body_model_output['full_pose']
+ if return_joints:
+ ret['joints'] = model_joints
+
+ return ret
+
+ def _set_keypoint_idxs(self):
+        """Set keypoint indices for 1) body parts that are assigned different
+        weights and 2) keypoints to be ignored in keypoint loss computation.
+
+ Returns:
+ None
+ """
+ convention = self.body_model.keypoint_dst
+
+ # obtain ignore keypoint indices
+ if self.ignore_keypoints is not None:
+ self.ignore_keypoint_idxs = []
+ for keypoint_name in self.ignore_keypoints:
+ keypoint_idx = get_keypoint_idx(keypoint_name,
+ convention=convention)
+ if keypoint_idx != -1:
+ self.ignore_keypoint_idxs.append(keypoint_idx)
+
+ # obtain body part keypoint indices
+ shoulder_keypoint_idxs = get_keypoint_idxs_by_part(
+ 'shoulder', convention=convention)
+ hip_keypoint_idxs = get_keypoint_idxs_by_part('hip',
+ convention=convention)
+ self.shoulder_hip_keypoint_idxs = [
+ *shoulder_keypoint_idxs, *hip_keypoint_idxs
+ ]
+
+ # head keypoints include all facial landmarks
+ self.face_keypoint_idxs = get_keypoint_idxs_by_part(
+ 'head', convention=convention)
+
+ left_hand_keypoint_idxs = get_keypoint_idxs_by_part(
+ 'left_hand', convention=convention)
+ right_hand_keypoint_idxs = get_keypoint_idxs_by_part(
+ 'right_hand', convention=convention)
+ self.hand_keypoint_idxs = [
+ *left_hand_keypoint_idxs, *right_hand_keypoint_idxs
+ ]
+
+ self.body_keypoint_idxs = get_keypoint_idxs_by_part(
+ 'body', convention=convention)
+
+ def _get_weight(self,
+ use_shoulder_hip_only: bool = False,
+ body_weight: float = 1.0,
+ hand_weight: float = 1.0,
+ face_weight: float = 1.0):
+ """Get per keypoint weight.
+
+ Notes:
+ K: number of keypoints
+
+ Args:
+ use_shoulder_hip_only: whether to use only shoulder and hip
+ keypoints for loss computation. This is useful in the
+ warming-up stage to find a reasonably good initialization.
+ body_weight: weight of body keypoints. Body part segmentation
+ definition is included in the HumanData convention.
+ hand_weight: weight of hand keypoints.
+ face_weight: weight of face keypoints.
+
+ Returns:
+ weight: per keypoint weight tensor of shape (K)
+ """
+ num_keypoint = self.body_model.num_joints
+
+ if use_shoulder_hip_only:
+ weight = torch.zeros([num_keypoint]).to(self.device)
+ weight[self.shoulder_hip_keypoint_idxs] = 1.0
+ else:
+ weight = torch.ones([num_keypoint]).to(self.device)
+
+ weight[self.body_keypoint_idxs] = \
+ weight[self.body_keypoint_idxs] * body_weight
+ weight[self.hand_keypoint_idxs] = \
+ weight[self.hand_keypoint_idxs] * hand_weight
+ weight[self.face_keypoint_idxs] = \
+ weight[self.face_keypoint_idxs] * face_weight
+
+ if hasattr(self, 'ignore_keypoint_idxs'):
+ weight[self.ignore_keypoint_idxs] = 0.0
+
+ return weight
diff --git a/detrsmpl/models/utils/SMPLX.py b/detrsmpl/models/utils/SMPLX.py
new file mode 100644
index 0000000000000000000000000000000000000000..d76142d71e070b3f166b4b39e74306ceff12983d
--- /dev/null
+++ b/detrsmpl/models/utils/SMPLX.py
@@ -0,0 +1,669 @@
+from typing import List
+
+import torch
+import torch.nn.functional as F
+from smplx.utils import find_joint_kin_chain
+
+from detrsmpl.core.conventions.keypoints_mapping import (
+ get_keypoint_idx,
+ get_keypoint_idxs_by_part,
+)
+from detrsmpl.utils.geometry import weak_perspective_projection
+
+
+class SMPLXHandMergeFunc():
+    """This function uses predictions from the hand model to update the hand
+    params (right_hand_pose, left_hand_pose, wrist_pose) in the predictions
+    from the body model."""
+ def __init__(self, body_model, convention='smplx'):
+ self.body_model = body_model
+ self.convention = convention
+ self.left_hand_idxs = get_keypoint_idxs_by_part(
+ 'left_hand', self.convention)
+ self.left_wrist_idx = get_keypoint_idx('left_wrist', self.convention)
+ self.left_hand_idxs.append(self.left_wrist_idx)
+ self.left_wrist_kin_chain = find_joint_kin_chain(
+ self.left_wrist_idx, self.body_model.parents)
+
+ self.right_hand_idxs = get_keypoint_idxs_by_part(
+ 'right_hand', self.convention)
+ self.right_wrist_idx = get_keypoint_idx('right_wrist', self.convention)
+ self.right_hand_idxs.append(self.right_wrist_idx)
+ self.right_wrist_kin_chain = find_joint_kin_chain(
+ self.right_wrist_idx, self.body_model.parents)
+
+ def __call__(self, body_predictions, hand_predictions):
+        """Merge the hand model predictions into the body model prediction.
+
+ Args:
+ body_predictions (dict): The prediction from body model.
+ hand_predictions (dict): The prediction from hand model.
+ Returns:
+ dict: Merged prediction.
+ """
+ pred_param = body_predictions['pred_param']
+ global_orient = pred_param['global_orient']
+ body_pose = pred_param['body_pose']
+ pred_cam = body_predictions['pred_cam']
+ batch_size = pred_cam.shape[0]
+ device = pred_cam.device
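+        # The hand model output is assumed to be a stacked batch of
+        # [right-hand crops, flipped left-hand crops] (see SMPLXHandCropFunc),
+        # so the first batch_size entries are right hands and the remaining
+        # ones are left hands.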
+ hands_from_body_idxs = torch.arange(0,
+ 2 * batch_size,
+ dtype=torch.long,
+ device=device)
+ right_hand_from_body_idxs = hands_from_body_idxs[:batch_size]
+ left_hand_from_body_idxs = hands_from_body_idxs[batch_size:]
+
+ parent_rots = []
+ right_wrist_parent_rot = find_joint_global_rotation(
+ self.right_wrist_kin_chain[1:], global_orient, body_pose)
+
+ left_wrist_parent_rot = find_joint_global_rotation(
+ self.left_wrist_kin_chain[1:], global_orient, body_pose)
+ left_to_right_wrist_parent_rot = flip_rotmat(left_wrist_parent_rot)
+
+ parent_rots += [right_wrist_parent_rot, left_to_right_wrist_parent_rot]
+ parent_rots = torch.cat(parent_rots, dim=0)
+
+ wrist_pose_from_hand = hand_predictions['pred_param']['global_orient']
+ # Undo the rotation of the parent joints to make the wrist rotation
+ # relative again
+ wrist_pose_from_hand = torch.matmul(
+ parent_rots.reshape(-1, 3, 3).transpose(1, 2),
+ wrist_pose_from_hand.reshape(-1, 3, 3))
+
+ right_hand_wrist = wrist_pose_from_hand[right_hand_from_body_idxs]
+ left_hand_wrist = flip_rotmat(
+ wrist_pose_from_hand[left_hand_from_body_idxs])
+ right_hand_pose = hand_predictions['pred_param']['right_hand_pose'][
+ right_hand_from_body_idxs]
+ left_hand_pose = flip_rotmat(
+ hand_predictions['pred_param']['right_hand_pose']
+ [left_hand_from_body_idxs])
+
+ body_predictions['pred_param']['right_hand_pose'] = right_hand_pose
+ body_predictions['pred_param']['left_hand_pose'] = left_hand_pose
+ body_predictions['pred_param']['body_pose'][:, self.right_wrist_idx -
+ 1] = right_hand_wrist
+ body_predictions['pred_param']['body_pose'][:, self.left_wrist_idx -
+ 1] = left_hand_wrist
+
+ return body_predictions
+
+
+class SMPLXFaceMergeFunc():
+    """This function uses predictions from the face model to update the face
+    params (jaw_pose, expression) in the predictions from the body model."""
+ def __init__(self,
+ body_model,
+ convention='smplx',
+ num_expression_coeffs=10):
+ self.body_model = body_model
+ self.convention = convention
+ self.num_expression_coeffs = num_expression_coeffs
+
+ def __call__(self, body_predictions, face_predictions):
+        """Merge the face model predictions into the body model prediction.
+
+ Args:
+ body_predictions (dict): The prediction from body model.
+ face_predictions (dict): The prediction from face model.
+ Returns:
+ dict: Merged prediction.
+ """
+ body_predictions['pred_param']['jaw_pose'] = face_predictions[
+ 'pred_param']['jaw_pose']
+ body_predictions['pred_param']['expression'] = face_predictions[
+ 'pred_param']['expression'][:, :self.num_expression_coeffs]
+ return body_predictions
+
+
+def points_to_bbox(points, bbox_scale_factor: float = 1.0):
+    """Get a scaled square bounding box from 2D keypoints."""
+ min_coords, _ = torch.min(points, dim=1)
+ xmin, ymin = min_coords[:, 0], min_coords[:, 1]
+ max_coords, _ = torch.max(points, dim=1)
+ xmax, ymax = max_coords[:, 0], max_coords[:, 1]
+
+ center = torch.stack([xmax + xmin, ymax + ymin], dim=-1) * 0.5
+
+ width = (xmax - xmin)
+ height = (ymax - ymin)
+
+ # Convert the bounding box to a square box
+ size = torch.max(width, height) * bbox_scale_factor
+
+ return center, size
+
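+# Illustrative usage of points_to_bbox (hypothetical tensor names): for 2D
+# keypoints of shape (B, K, 2), e.g.
+#     center, size = points_to_bbox(keypoints2d, bbox_scale_factor=1.2)
+# center has shape (B, 2) and size has shape (B,), describing a square box
+# around the keypoints, enlarged by the scale factor.
+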
+
+def get_crop_info(points,
+ img_metas,
+ scale_factor: float = 1.0,
+ crop_size: int = 256):
+    """Get the transformation that maps points on the cropped image to points
+    on the original image."""
+ device = points.device
+ dtype = points.dtype
+ batch_size = points.shape[0]
+ # Get the image to crop transformations and bounding box sizes
+ crop_transforms = []
+ img_bbox_sizes = []
+ for img_meta in img_metas:
+ crop_transforms.append(img_meta['crop_transform'])
+ img_bbox_sizes.append(img_meta['scale'].max())
+
+ img_bbox_sizes = torch.tensor(img_bbox_sizes, dtype=dtype, device=device)
+
+ crop_transforms = torch.tensor(crop_transforms, dtype=dtype, device=device)
+
+ crop_transforms = torch.cat([
+ crop_transforms,
+ torch.tensor([0.0, 0.0, 1.0], dtype=dtype, device=device).expand(
+ [batch_size, 1, 3])
+ ],
+ dim=1)
+
+ inv_crop_transforms = torch.inverse(crop_transforms)
+
+ # center on the cropped body image
+ center_body_crop, bbox_size = points_to_bbox(
+ points, bbox_scale_factor=scale_factor)
+
+ orig_bbox_size = bbox_size / crop_size * img_bbox_sizes
+
+ # Compute the center of the crop in the original image
+ center = (torch.einsum(
+ 'bij,bj->bi', [inv_crop_transforms[:, :2, :2], center_body_crop]) +
+ inv_crop_transforms[:, :2, 2])
+
+ return {
+ 'center': center.reshape(-1, 2),
+ 'orig_bbox_size': orig_bbox_size,
+ # 'bbox_size': bbox_size.reshape(-1),
+ 'inv_crop_transforms': inv_crop_transforms,
+ # 'center_body_crop': 2 * center_body_crop / (crop_size-1) - 1,
+ }
+
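+# get_crop_info assumes each img_meta provides a 2x3 'crop_transform' that
+# maps the original image to the body crop, plus a 'scale' whose max entry is
+# the body bounding box size in the original image; the transform is inverted
+# inside get_crop_info to map crop-space points back to the original image.
+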
+
+def concat_images(images: List[torch.Tensor]):
+    """Concatenate images of different sizes into a zero-padded batch."""
+ sizes = [img.shape[1:] for img in images]
+ H, W = [max(s) for s in zip(*sizes)]
+ batch_size = len(images)
+ batched_shape = (batch_size, images[0].shape[0], H, W)
+ batched = torch.zeros(batched_shape,
+ device=images[0].device,
+ dtype=images[0].dtype)
+ for ii, img in enumerate(images):
+ shape = img.shape
+ batched[ii, :shape[0], :shape[1], :shape[2]] = img
+ return batched
+
+
+def flip_rotmat(pose_rotmat):
+    """Flip a batch of rotation matrices left-right.
+
+    Negating entries (0, 1), (0, 2), (1, 0) and (2, 0) of each 3x3 matrix is
+    equivalent to conjugating it by diag(-1, 1, 1), i.e. mirroring the
+    rotation across the x = 0 plane.
+    """
+ rot_mats = pose_rotmat.reshape(-1, 9).clone()
+
+ rot_mats[:, [1, 2, 3, 6]] *= -1
+ return rot_mats.view_as(pose_rotmat)
+
+
+def find_joint_global_rotation(kin_chain, root_pose, body_pose):
+ """Computes the absolute rotation of a joint from the kinematic chain."""
+ # Create a single vector with all the poses
+ parents_pose = torch.cat([root_pose, body_pose], dim=1)[:, kin_chain]
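+    # kin_chain is assumed to be ordered from the joint up to the root (as
+    # returned by smplx's find_joint_kin_chain), so the successive left
+    # multiplications below accumulate R_root @ ... @ R_parent @ R_joint.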
+ output_pose = parents_pose[:, 0]
+ for idx in range(1, parents_pose.shape[1]):
+ output_pose = torch.bmm(parents_pose[:, idx], output_pose)
+ return output_pose
+
+
+class CropSampler():
+ """This function crops the HD images using bilinear interpolation."""
+ def __init__(self, crop_size: int = 256) -> None:
+ """Uses bilinear sampling to extract square crops.
+
+        This module expects a high resolution image as input and a bounding
+        box, described by its center and size. It then extracts a sub-image
+        at that location through bilinear interpolation.
+
+ Parameters
+ ----------
+ crop_size: int
+ The desired size for the crop.
+ """
+ super(CropSampler, self).__init__()
+
+ self.crop_size = crop_size
+ x = torch.arange(0, crop_size, dtype=torch.float32) / (crop_size - 1)
+ grid_y, grid_x = torch.meshgrid(x, x)
+
+ points = torch.stack([grid_y.flatten(), grid_x.flatten()], axis=1)
+
+ self.grid = points.unsqueeze(dim=0)
+
+ def _sample_padded(self, full_imgs, sampling_grid):
+        """Sample the crops via bilinear grid sampling."""
+ # Get the sub-images using bilinear interpolation
+ return F.grid_sample(full_imgs, sampling_grid, align_corners=True)
+
+ def __call__(self, full_imgs, center, bbox_size):
+ """Crops the HD images using the provided bounding boxes.
+
+ Parameters
+ ----------
+        full_imgs: torch.Tensor
+            A (B, C, H, W) tensor that contains the full resolution images
+ center: torch.Tensor
+ A Bx2 tensor that contains the coordinates of the center of
+ the bounding box that will be cropped from the original
+ image
+ bbox_size: torch.Tensor
+            A size B tensor that contains the size of the crop
+
+ Returns
+ -------
+        cropped_images: torch.Tensor
+ The images cropped from the high resolution input
+ sampling_grid: torch.Tensor
+ The grid used to sample the crops
+ """
+
+ batch_size, _, H, W = full_imgs.shape
+ self.grid = self.grid.to(device=full_imgs.device)
+ transforms = torch.eye(3,
+ dtype=full_imgs.dtype,
+ device=full_imgs.device).reshape(
+ 1, 3, 3).expand(batch_size, -1,
+ -1).contiguous()
+
+ hd_to_crop = torch.eye(3,
+ dtype=full_imgs.dtype,
+ device=full_imgs.device).reshape(
+ 1, 3, 3).expand(batch_size, -1,
+ -1).contiguous()
+
+ # Create the transformation that maps crop pixels to image coordinates,
+ # i.e. pixel (0, 0) from the crop_size x crop_size grid gets mapped to
+ # the top left of the bounding box, pixel
+ # (crop_size - 1, crop_size - 1) to the bottom right corner of the
+ # bounding box
+ transforms[:, 0, 0] = bbox_size # / (self.crop_size - 1)
+ transforms[:, 1, 1] = bbox_size # / (self.crop_size - 1)
+ transforms[:, 0, 2] = center[:, 0] - bbox_size * 0.5
+ transforms[:, 1, 2] = center[:, 1] - bbox_size * 0.5
+
+ hd_to_crop[:, 0, 0] = 2 * (self.crop_size - 1) / bbox_size
+ hd_to_crop[:, 1, 1] = 2 * (self.crop_size - 1) / bbox_size
+ hd_to_crop[:, 0,
+ 2] = -(center[:, 0] - bbox_size * 0.5) * hd_to_crop[:, 0,
+ 0] - 1
+ hd_to_crop[:, 1,
+ 2] = -(center[:, 1] - bbox_size * 0.5) * hd_to_crop[:, 1,
+ 1] - 1
+
+ size_bbox_sizer = torch.eye(3,
+ dtype=full_imgs.dtype,
+ device=full_imgs.device).reshape(
+ 1, 3, 3).expand(batch_size, -1,
+ -1).contiguous()
+
+ # Normalize the coordinates to [-1, 1] for the grid_sample function
+ size_bbox_sizer[:, 0, 0] = 2.0 / (W - 1)
+ size_bbox_sizer[:, 1, 1] = 2.0 / (H - 1)
+ size_bbox_sizer[:, :2, 2] = -1
+
+ # full_transform = transforms
+ full_transform = torch.bmm(size_bbox_sizer, transforms)
+
+ batch_grid = self.grid.expand(batch_size, -1, -1)
+ # Convert the grid to image coordinates using the transformations above
+ sampling_grid = (
+ torch.bmm(full_transform[:, :2, :2], batch_grid.transpose(1, 2)) +
+ full_transform[:, :2, [2]]).transpose(1, 2)
+ sampling_grid = sampling_grid.reshape(-1, self.crop_size,
+ self.crop_size,
+ 2).transpose(1, 2)
+
+ out_images = self._sample_padded(full_imgs, sampling_grid)
+
+ return {
+ 'images': out_images,
+ 'sampling_grid': sampling_grid.reshape(batch_size, -1, 2),
+ 'transform': transforms,
+ 'hd_to_crop': hd_to_crop,
+ }
+
+
+class SMPLXHandCropFunc():
+    """This function crops hand images from the original image.
+
+    It uses the keypoints predicted by the body model to locate the hand
+    positions.
+ """
+ def __init__(self,
+ model_head,
+ body_model,
+ convention='smplx',
+ img_res=256,
+ scale_factor=2.0,
+ crop_size=224,
+ condition_hand_wrist_pose=True,
+ condition_hand_shape=False,
+ condition_hand_finger_pose=True):
+ self.model_head = model_head
+ self.body_model = body_model
+ self.img_res = img_res
+ self.convention = convention
+ self.left_hand_idxs = get_keypoint_idxs_by_part(
+ 'left_hand', self.convention)
+ left_wrist_idx = get_keypoint_idx('left_wrist', self.convention)
+ self.left_hand_idxs.append(left_wrist_idx)
+ self.left_wrist_kin_chain = find_joint_kin_chain(
+ left_wrist_idx, self.body_model.parents)
+
+ self.right_hand_idxs = get_keypoint_idxs_by_part(
+ 'right_hand', self.convention)
+ right_wrist_idx = get_keypoint_idx('right_wrist', self.convention)
+ self.right_hand_idxs.append(right_wrist_idx)
+ self.right_wrist_kin_chain = find_joint_kin_chain(
+ right_wrist_idx, self.body_model.parents)
+
+ self.scale_factor = scale_factor
+ self.hand_cropper = CropSampler(crop_size)
+
+ self.condition_hand_wrist_pose = condition_hand_wrist_pose
+ self.condition_hand_shape = condition_hand_shape
+ self.condition_hand_finger_pose = condition_hand_finger_pose
+
+ def build_hand_mean(self, global_orient, body_pose, betas, left_hand_pose,
+ raw_right_hand_pose, batch_size):
+ """Builds the initial point for the iterative regressor of the hand."""
+ hand_mean = []
+
+ # if self.condition_hand_on_body:
+ # Convert the absolute pose to the latent representation
+ if self.condition_hand_wrist_pose:
+ # Compute the absolute pose of the right wrist
+ right_wrist_pose_abs = find_joint_global_rotation(
+ self.right_wrist_kin_chain, global_orient, body_pose)
+ right_wrist_pose = right_wrist_pose_abs[:, :3, :2].contiguous(
+ ).reshape(batch_size, -1)
+
+ # Compute the absolute rotation for the left wrist
+ left_wrist_pose_abs = find_joint_global_rotation(
+ self.left_wrist_kin_chain, global_orient, body_pose)
+ # Flip the left wrist to the right
+ left_to_right_wrist_pose = flip_rotmat(left_wrist_pose_abs)
+
+ # Convert to the latent representation
+ left_to_right_wrist_pose = left_to_right_wrist_pose[:, :3, :
+ 2].contiguous(
+ ).reshape(
+ batch_size,
+ -1)
+ else:
+ right_wrist_pose = self.model_head.get_mean('global_orient',
+ batch_size=batch_size)
+ left_to_right_wrist_pose = self.model_head.get_mean(
+ 'global_orient', batch_size=batch_size)
+
+ # Convert the pose of the left hand to the right hand and project
+ # it to the encoder space
+ left_to_right_hand_pose = flip_rotmat(
+ left_hand_pose)[:, :, :3, :2].contiguous().reshape(batch_size, -1)
+ right_hand_pose = raw_right_hand_pose.reshape(batch_size, -1)
+ camera_mean = self.model_head.get_mean('camera', batch_size=batch_size)
+
+ shape_condition = (betas if self.condition_hand_shape else
+ self.model_head.get_mean('shape',
+ batch_size=batch_size))
+ right_finger_pose_condition = (
+ right_hand_pose if self.condition_hand_finger_pose else
+ self.model_head.get_mean('right_hand_pose', batch_size=batch_size))
+ right_hand_mean = torch.cat([
+ right_wrist_pose, right_finger_pose_condition, shape_condition,
+ camera_mean
+ ],
+ dim=1)
+
+ left_finger_pose_condition = (
+ left_to_right_hand_pose if self.condition_hand_finger_pose else
+ self.model_head.get_mean('right_hand_pose', batch_size=batch_size))
+ # Should be Bx31
+ left_hand_mean = torch.cat([
+ left_to_right_wrist_pose, left_finger_pose_condition,
+ shape_condition, camera_mean
+ ],
+ dim=1)
+
+ hand_mean += [right_hand_mean, left_hand_mean]
+ hand_mean = torch.cat(hand_mean, dim=0)
+
+ return hand_mean
+
+ def __call__(self, body_predictions, img_metas):
+        """Crop hand regions from the original images.
+
+ Args:
+ body_predictions (dict): The prediction from body model.
+ img_metas (dict): Information of the input images.
+ Returns:
+ all_hand_imgs (torch.tensor): Cropped hand images.
+ hand_mean (torch.tensor): Mean value of hand params.
+ crop_info (dict): Hand crop transforms.
+ """
+ pred_param = body_predictions['pred_param']
+ pred_cam = body_predictions['pred_cam']
+ pred_raw = body_predictions['pred_raw']
+ pred_output = self.body_model(**pred_param)
+
+ pred_keypoints3d = pred_output['joints']
+ pred_keypoints2d = weak_perspective_projection(
+ pred_keypoints3d,
+ scale=pred_cam[:, 0],
+ translation=pred_cam[:, 1:3])
+ # concat ori_img
+ full_images = []
+ for img_meta in img_metas:
+ full_images.append(img_meta['ori_img'].to(device=pred_cam.device))
+ full_imgs = concat_images(full_images)
+
+ # left hand
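+        # pred_keypoints2d is assumed to be normalized to [-1, 1]; map it to
+        # pixel coordinates of the img_res x img_res body crop before
+        # estimating the hand bounding boxes (same for the right hand below).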
+ left_hand_joints = (pred_keypoints2d[:, self.left_hand_idxs] * 0.5 +
+ 0.5) * (self.img_res - 1)
+ left_hand_points_to_crop = get_crop_info(left_hand_joints, img_metas,
+ self.scale_factor,
+ self.img_res)
+ left_hand_center = left_hand_points_to_crop['center']
+ left_hand_orig_bbox_size = left_hand_points_to_crop['orig_bbox_size']
+ left_hand_inv_crop_transforms = left_hand_points_to_crop[
+ 'inv_crop_transforms']
+
+ left_hand_cropper_out = self.hand_cropper(full_imgs, left_hand_center,
+ left_hand_orig_bbox_size)
+ left_hand_crops = left_hand_cropper_out['images']
+ # left_hand_points = left_hand_cropper_out['sampling_grid']
+ left_hand_crop_transform = left_hand_cropper_out['transform']
+
+ # right hand
+ right_hand_joints = (pred_keypoints2d[:, self.right_hand_idxs] * 0.5 +
+ 0.5) * (self.img_res - 1)
+ right_hand_points_to_crop = get_crop_info(right_hand_joints, img_metas,
+ self.scale_factor,
+ self.img_res)
+ right_hand_center = right_hand_points_to_crop['center']
+ right_hand_orig_bbox_size = right_hand_points_to_crop['orig_bbox_size']
+ # right_hand_inv_crop_transforms = right_hand_points_to_crop[
+ # 'inv_crop_transforms']
+ right_hand_cropper_out = self.hand_cropper(full_imgs,
+ right_hand_center,
+ right_hand_orig_bbox_size)
+ right_hand_crops = right_hand_cropper_out['images']
+ # right_hand_points = right_hand_cropper_out['sampling_grid']
+ right_hand_crop_transform = right_hand_cropper_out['transform']
+
+ # concat
+ all_hand_imgs = []
+ all_hand_imgs.append(right_hand_crops)
+ all_hand_imgs.append(torch.flip(left_hand_crops, dims=(-1, )))
+
+ # [right_hand , left hand]
+ all_hand_imgs = torch.cat(all_hand_imgs, dim=0)
+ hand_mean = self.build_hand_mean(pred_param['global_orient'],
+ pred_param['body_pose'],
+ pred_param['betas'],
+ pred_param['left_hand_pose'],
+ pred_raw['raw_right_hand_pose'],
+ batch_size=full_imgs.shape[0])
+ crop_info = dict(
+ hand_inv_crop_transforms=left_hand_inv_crop_transforms,
+ left_hand_crop_transform=left_hand_crop_transform,
+ right_hand_crop_transform=right_hand_crop_transform)
+ return all_hand_imgs, hand_mean, crop_info
+
+
+class SMPLXFaceCropFunc():
+    """This function crops face images from the original image.
+
+    It uses the keypoints predicted by the body model to locate the face
+    position.
+ """
+ def __init__(self,
+ model_head,
+ body_model,
+ convention='smplx',
+ img_res=256,
+ scale_factor=2.0,
+ crop_size=256,
+ num_betas=10,
+ num_expression_coeffs=10,
+ condition_face_neck_pose=False,
+ condition_face_jaw_pose=True,
+ condition_face_shape=False,
+ condition_face_expression=True):
+ self.model_head = model_head
+ self.body_model = body_model
+ self.img_res = img_res
+ self.convention = convention
+ self.num_betas = num_betas
+ self.num_expression_coeffs = num_expression_coeffs
+
+ self.face_idx = get_keypoint_idxs_by_part('head', self.convention)
+ neck_idx = get_keypoint_idx('neck', self.convention)
+ self.neck_kin_chain = find_joint_kin_chain(neck_idx,
+ self.body_model.parents)
+
+ self.condition_face_neck_pose = condition_face_neck_pose
+ self.condition_face_jaw_pose = condition_face_jaw_pose
+ self.condition_face_shape = condition_face_shape
+ self.condition_face_expression = condition_face_expression
+
+ self.scale_factor = scale_factor
+ self.face_cropper = CropSampler(crop_size)
+
+ def build_face_mean(self, global_orient, body_pose, betas, raw_jaw_pose,
+ expression, batch_size):
+ """Builds the initial point for the iterative regressor of the face."""
+ face_mean = []
+        # Compute the absolute pose of the neck
+ neck_pose_abs = find_joint_global_rotation(self.neck_kin_chain,
+ global_orient, body_pose)
+        # Convert the absolute neck pose to the 6D rotation representation
+ neck_pose = neck_pose_abs[:, :3, :2].contiguous().reshape(
+ batch_size, -1)
+
+ camera_mean = self.model_head.get_mean('camera', batch_size=batch_size)
+
+ neck_pose_condition = (neck_pose if self.condition_face_neck_pose else
+ self.model_head.get_mean('global_orient',
+ batch_size=batch_size))
+
+ jaw_pose_condition = (raw_jaw_pose.reshape(batch_size, -1)
+ if self.condition_face_jaw_pose else
+ self.model_head.get_mean('jaw_pose',
+ batch_size=batch_size))
+ face_num_betas = self.model_head.get_num_betas()
+ shape_padding_size = face_num_betas - self.num_betas
+ betas_condition = (
+ F.pad(betas.reshape(batch_size, -1),
+ (0, shape_padding_size)) if self.condition_face_shape else
+ self.model_head.get_mean('shape', batch_size=batch_size))
+
+ face_num_expression_coeffs = self.model_head.get_num_expression_coeffs(
+ )
+ expr_padding_size = face_num_expression_coeffs \
+ - self.num_expression_coeffs
+ expression_condition = (
+ F.pad(expression.reshape(batch_size, -1),
+ (0, expr_padding_size)) if self.condition_face_expression
+ else self.model_head.get_mean('expression', batch_size=batch_size))
+
+ # Should be Bx(Head pose params)
+ face_mean.append(
+ torch.cat([
+ neck_pose_condition,
+ jaw_pose_condition,
+ betas_condition,
+ expression_condition,
+ camera_mean.reshape(batch_size, -1),
+ ],
+ dim=1))
+
+ face_mean = torch.cat(face_mean, dim=0)
+ return face_mean
+
+ def __call__(self, body_predictions, img_metas):
+        """Crop face regions from the original images.
+
+ Args:
+ body_predictions (dict): The prediction from body model.
+ img_metas (dict): Information of the input images.
+ Returns:
+ all_face_imgs (torch.tensor): Cropped face images.
+ face_mean (torch.tensor): Mean value of face params.
+ crop_info (dict): Face crop transforms.
+ """
+ pred_param = body_predictions['pred_param']
+ pred_cam = body_predictions['pred_cam']
+ pred_raw = body_predictions['pred_raw']
+
+ pred_output = self.body_model(**pred_param)
+
+ pred_keypoints3d = pred_output['joints']
+ pred_keypoints2d = weak_perspective_projection(
+ pred_keypoints3d,
+ scale=pred_cam[:, 0],
+ translation=pred_cam[:, 1:3])
+ # concat ori_img
+ full_images = []
+ for img_meta in img_metas:
+ full_images.append(img_meta['ori_img'].to(device=pred_cam.device))
+ full_imgs = concat_images(full_images)
+
+ face_joints = (pred_keypoints2d[:, self.face_idx] * 0.5 +
+ 0.5) * (self.img_res - 1)
+ face_points_to_crop = get_crop_info(face_joints, img_metas,
+ self.scale_factor, self.img_res)
+ face_center = face_points_to_crop['center']
+ face_orig_bbox_size = face_points_to_crop['orig_bbox_size']
+ face_inv_crop_transforms = face_points_to_crop['inv_crop_transforms']
+
+ face_cropper_out = self.face_cropper(full_imgs, face_center,
+ face_orig_bbox_size)
+ face_crops = face_cropper_out['images']
+ # face_points = face_cropper_out['sampling_grid']
+ face_crop_transform = face_cropper_out['transform']
+
+ all_face_imgs = [face_crops]
+ all_face_imgs = torch.cat(all_face_imgs, dim=0)
+
+ face_mean = self.build_face_mean(pred_param['global_orient'],
+ pred_param['body_pose'],
+ pred_param['betas'],
+ pred_raw['raw_jaw_pose'],
+ pred_param['expression'],
+ batch_size=full_imgs.shape[0])
+ crop_info = dict(face_inv_crop_transforms=face_inv_crop_transforms,
+ face_crop_transform=face_crop_transform)
+ return all_face_imgs, face_mean, crop_info
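+
+
+if __name__ == '__main__':
+    # Minimal self-contained sketch (not part of the original module): check
+    # the output shapes of points_to_bbox and CropSampler on random inputs.
+    _imgs = torch.rand(2, 3, 64, 64)  # hypothetical full-resolution images
+    _pts = torch.rand(2, 21, 2) * 64  # hypothetical 2D keypoints in pixels
+    _center, _size = points_to_bbox(_pts, bbox_scale_factor=1.2)
+    _out = CropSampler(crop_size=8)(_imgs, _center, _size)
+    assert _center.shape == (2, 2) and _size.shape == (2,)
+    assert _out['images'].shape == (2, 3, 8, 8)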
diff --git a/detrsmpl/models/utils/__init__.py b/detrsmpl/models/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..565fd43e8ec73a284cf1a013eb7eefacd8e1f984
--- /dev/null
+++ b/detrsmpl/models/utils/__init__.py
@@ -0,0 +1,23 @@
+from .builder import (
+ build_linear_layer,
+ build_positional_encoding,
+ build_transformer,
+)
+from .fits_dict import FitsDict
+from .inverse_kinematics import batch_inverse_kinematics_transform
+from .res_layer import ResLayer, SimplifiedBasicBlock
+from .SMPLX import (
+ SMPLXFaceCropFunc,
+ SMPLXFaceMergeFunc,
+ SMPLXHandCropFunc,
+ SMPLXHandMergeFunc,
+)
+
+
+__all__ = [
+    'build_linear_layer', 'build_positional_encoding', 'build_transformer',
+    'FitsDict', 'ResLayer', 'SimplifiedBasicBlock',
+    'batch_inverse_kinematics_transform', 'SMPLXHandCropFunc',
+    'SMPLXFaceMergeFunc', 'SMPLXFaceCropFunc', 'SMPLXHandMergeFunc',
+]
diff --git a/detrsmpl/models/utils/builder.py b/detrsmpl/models/utils/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..e34647e4f0dda82e34c57b48b78549e09d406c67
--- /dev/null
+++ b/detrsmpl/models/utils/builder.py
@@ -0,0 +1,61 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch.nn as nn
+from mmcv.utils import Registry, build_from_cfg
+
+from .positional_encoding import (
+ LearnedPositionalEncoding,
+ SinePositionalEncoding,
+)
+
+TRANSFORMER = Registry('Transformer')
+LINEAR_LAYERS = Registry('linear layers')
+POSITIONAL_ENCODING = Registry('position encoding')
+
+LINEAR_LAYERS.register_module('Linear', module=nn.Linear)
+POSITIONAL_ENCODING.register_module('SinePositionalEncoding',
+ module=SinePositionalEncoding)
+POSITIONAL_ENCODING.register_module('LearnedPositionalEncoding',
+ module=LearnedPositionalEncoding)
+
+
+def build_transformer(cfg, default_args=None):
+ """Builder for Transformer."""
+ return build_from_cfg(cfg, TRANSFORMER, default_args)
+
+
+def build_linear_layer(cfg, *args, **kwargs):
+ """Build linear layer.
+ Args:
+ cfg (None or dict): The linear layer config, which should contain:
+ - type (str): Layer type.
+ - layer args: Args needed to instantiate an linear layer.
+ args (argument list): Arguments passed to the `__init__`
+ method of the corresponding linear layer.
+ kwargs (keyword arguments): Keyword arguments passed to the `__init__`
+ method of the corresponding linear layer.
+ Returns:
+ nn.Module: Created linear layer.
+ """
+ if cfg is None:
+ cfg_ = dict(type='Linear')
+ else:
+ if not isinstance(cfg, dict):
+ raise TypeError('cfg must be a dict')
+ if 'type' not in cfg:
+ raise KeyError('the cfg dict must contain the key "type"')
+ cfg_ = cfg.copy()
+
+ layer_type = cfg_.pop('type')
+ if layer_type not in LINEAR_LAYERS:
+ raise KeyError(f'Unrecognized linear type {layer_type}')
+ else:
+ linear_layer = LINEAR_LAYERS.get(layer_type)
+
+ layer = linear_layer(*args, **kwargs, **cfg_)
+
+ return layer
+
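+# Illustrative usage (hypothetical sizes, not from the original file):
+#     fc = build_linear_layer(dict(type='Linear'), in_features=256,
+#                             out_features=10)
+# builds a plain nn.Linear(256, 10) through the LINEAR_LAYERS registry.
+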
+
+def build_positional_encoding(cfg, default_args=None):
+ """Builder for Position Encoding."""
+ return build_from_cfg(cfg, POSITIONAL_ENCODING, default_args)
diff --git a/detrsmpl/models/utils/fits_dict.py b/detrsmpl/models/utils/fits_dict.py
new file mode 100644
index 0000000000000000000000000000000000000000..c090170364cac97ba376e3a4253120ed441fa5ce
--- /dev/null
+++ b/detrsmpl/models/utils/fits_dict.py
@@ -0,0 +1,134 @@
+# ------------------------------------------------------------------------------
+# Adapted from https://github.com/nkolot/SPIN/blob/master/train/fits_dict.py
+# Original licence please see docs/additional_licenses.md
+# ------------------------------------------------------------------------------
+
+import os
+
+import cv2
+import numpy as np
+import torch
+
+from detrsmpl.utils.transforms import aa_to_rotmat
+
+train_datasets = ['h36m', 'mpi_inf_3dhp', 'lsp', 'lspet', 'mpii', 'coco']
+static_fits_load_dir = 'data/static_fits'
+save_dir = 'data/spin_fits'
+
+# Permutation of SMPL pose parameters when the body is flipped horizontally
+SMPL_JOINTS_FLIP_PERM = [
+ 0, 2, 1, 3, 5, 4, 6, 8, 7, 9, 11, 10, 12, 14, 13, 15, 17, 16, 19, 18, 21,
+ 20, 23, 22
+]
+SMPL_POSE_FLIP_PERM = []
+for i in SMPL_JOINTS_FLIP_PERM:
+ SMPL_POSE_FLIP_PERM.append(3 * i)
+ SMPL_POSE_FLIP_PERM.append(3 * i + 1)
+ SMPL_POSE_FLIP_PERM.append(3 * i + 2)
+
+
+class FitsDict():
+ """Dictionary keeping track of the best fit per image in the training set.
+
+ Ref: https://github.com/nkolot/SPIN/blob/master/train/fits_dict.py
+ """
+ def __init__(self, fits='static') -> None:
+ assert fits in ['static', 'final']
+ self.fits = fits
+ self.fits_dict = {}
+
+ # array used to flip SMPL pose parameters
+ self.flipped_parts = torch.tensor(SMPL_POSE_FLIP_PERM,
+ dtype=torch.int64)
+ # Load dictionary state
+ # for ds_name, ds in train_dataset.dataset_dict.items():
+ for ds_name in train_datasets:
+
+ # h36m has gt so no static fits
+ if ds_name == 'h36m' or self.fits == 'static':
+ dict_file = os.path.join(static_fits_load_dir,
+ ds_name + '_fits.npy')
+ content = np.load(dict_file)
+ self.fits_dict[ds_name] = torch.from_numpy(content)
+ del content
+ elif self.fits == 'final':
+ dict_file = os.path.join('data/final_fits', ds_name + '.npz')
+ # load like this to save mem
+ content = np.load(dict_file)
+ pose = torch.from_numpy(content['pose'])
+ betas = torch.from_numpy(content['betas'])
+ del content
+ params = torch.cat([pose, betas], dim=-1)
+ self.fits_dict[ds_name] = params
+
+ def save(self):
+ """Save dictionary state to disk."""
+ for ds_name in train_datasets:
+ dict_file = os.path.join(save_dir, ds_name + '_fits.npy')
+ np.save(dict_file, self.fits_dict[ds_name].cpu().numpy())
+
+ def __getitem__(self, x):
+ """Retrieve dictionary entries."""
+ dataset_name, ind, rot, is_flipped = x
+ batch_size = len(dataset_name)
+ pose = torch.zeros((batch_size, 72))
+ betas = torch.zeros((batch_size, 10))
+ for ds, i, n in zip(dataset_name, ind, range(batch_size)):
+ params = self.fits_dict[ds][i]
+ pose[n, :] = params[:72]
+ betas[n, :] = params[72:]
+ pose = pose.clone()
+
+ # Apply flipping and rotation
+ pose = self.rotate_pose(self.flip_pose(pose, is_flipped), rot)
+
+ betas = betas.clone()
+ return pose, betas
+
+ def __setitem__(self, x, val):
+ """Update dictionary entries."""
+ dataset_name, ind, rot, is_flipped, update = x
+ pose, betas = val
+ batch_size = len(dataset_name)
+
+ # Undo flipping and rotation
+ pose = self.flip_pose(self.rotate_pose(pose, -rot), is_flipped)
+
+ params = torch.cat((pose, betas), dim=-1).cpu()
+ for ds, i, n in zip(dataset_name, ind, range(batch_size)):
+ if update[n]:
+ self.fits_dict[ds][i] = params[n]
+
+ def flip_pose(self, pose, is_flipped):
+ """flip SMPL pose parameters."""
+ is_flipped = is_flipped.bool()
+ pose_f = pose.clone()
+ pose_f[is_flipped, :] = pose[is_flipped][:, self.flipped_parts]
+ # we also negate the second and the third dimension of the
+ # axis-angle representation
+ pose_f[is_flipped, 1::3] *= -1
+ pose_f[is_flipped, 2::3] *= -1
+ return pose_f
+
+ def rotate_pose(self, pose, rot):
+ """Rotate SMPL pose parameters by rot degrees."""
+ pose = pose.clone()
+ cos = torch.cos(-np.pi * rot / 180.)
+ sin = torch.sin(-np.pi * rot / 180.)
+ zeros = torch.zeros_like(cos)
+ r3 = torch.zeros(cos.shape[0], 1, 3, device=cos.device)
+ r3[:, 0, -1] = 1
+ R = torch.cat([
+ torch.stack([cos, -sin, zeros], dim=-1).unsqueeze(1),
+ torch.stack([sin, cos, zeros], dim=-1).unsqueeze(1), r3
+ ],
+ dim=1)
+ global_pose = pose[:, :3]
+ global_pose_rotmat = R @ aa_to_rotmat(global_pose)
+ global_pose_rotmat = global_pose_rotmat.cpu().numpy()
+ global_pose_np = np.zeros((global_pose.shape[0], 3))
+ for i in range(global_pose.shape[0]):
+ aa, _ = cv2.Rodrigues(global_pose_rotmat[i])
+ global_pose_np[i, :] = aa.squeeze()
+ pose[:, :3] = torch.from_numpy(global_pose_np).to(pose.device)
+ return pose
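Since `FitsDict.__init__` expects pre-computed SPIN fit files under `data/static_fits` / `data/final_fits`, the flipping logic above can be sanity-checked in isolation. The sketch below simply mirrors `flip_pose` on a random pose tensor and verifies that flipping twice is the identity; it does not call the class or read any files.

```python
# Standalone sketch of the SMPL pose flipping used in FitsDict.flip_pose.
import torch

SMPL_JOINTS_FLIP_PERM = [
    0, 2, 1, 3, 5, 4, 6, 8, 7, 9, 11, 10, 12, 14, 13, 15, 17, 16, 19, 18, 21,
    20, 23, 22
]
# Expand the joint-level permutation to the 72 axis-angle parameters.
flipped_parts = torch.tensor(
    [3 * i + k for i in SMPL_JOINTS_FLIP_PERM for k in range(3)],
    dtype=torch.int64)

pose = torch.randn(4, 72)                       # (batch, 24 joints * 3)
is_flipped = torch.tensor([1, 0, 1, 0]).bool()  # flip only samples 0 and 2

pose_f = pose.clone()
pose_f[is_flipped] = pose[is_flipped][:, flipped_parts]
# Mirroring about the x-axis negates the y and z axis-angle components.
pose_f[is_flipped, 1::3] *= -1
pose_f[is_flipped, 2::3] *= -1

# Flipping twice recovers the original pose.
pose_ff = pose_f.clone()
pose_ff[is_flipped] = pose_f[is_flipped][:, flipped_parts]
pose_ff[is_flipped, 1::3] *= -1
pose_ff[is_flipped, 2::3] *= -1
assert torch.allclose(pose_ff, pose)
```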
diff --git a/detrsmpl/models/utils/inverse_kinematics.py b/detrsmpl/models/utils/inverse_kinematics.py
new file mode 100644
index 0000000000000000000000000000000000000000..cff524e2275170960c9b68ff59aa382298acdae4
--- /dev/null
+++ b/detrsmpl/models/utils/inverse_kinematics.py
@@ -0,0 +1,432 @@
+"""This script is based on the release codes:
+
+"HybrIK: A Hybrid Analytical-Neural Inverse Kinematics Solution for 3D Human
+Pose and Shape Estimation. CVPR 2021"
+(https://github.com/Jeff-sjtu/HybrIK).
+"""
+
+from __future__ import absolute_import, division, print_function
+
+import torch
+
+from detrsmpl.utils.transforms import aa_to_rotmat
+
+
+def batch_inverse_kinematics_transform(pose_skeleton,
+ global_orient,
+ phis,
+ rest_pose,
+ children,
+ parents,
+ dtype=torch.float32,
+ train=False,
+ leaf_thetas=None):
+ """Applies inverse kinematics transform to joints in a batch.
+
+ Args:
+ pose_skeleton (torch.tensor):
+ Locations of estimated pose skeleton with shape (Bx29x3)
+ global_orient (torch.tensor|none):
+ Tensor of global rotation matrices with shape (Bx1x3x3)
+ phis (torch.tensor):
+ Rotation on bone axis parameters with shape (Bx23x2)
+ rest_pose (torch.tensor):
+ Locations of rest (Template) pose with shape (Bx29x3)
+ children (List[int]): list of indexes of kinematic children with len 29
+ parents (List[int]): list of indexes of kinematic parents with len 29
+ dtype (torch.dtype, optional):
+ Data type of the created tensors. Default: torch.float32
+ train (bool):
+ Store True in train mode. Default: False
+ leaf_thetas (torch.tensor, optional):
+            Rotation matrices for 5 leaf joints (Bx5x3x3). Default: None
+
+
+ Returns:
+ rot_mats (torch.tensor):
+            Rotation matrices of all joints with shape (Bx29x3x3)
+        rotate_rest_pose (torch.tensor):
+            Locations of the rotated rest/template pose with shape (Bx29x3)
+ """
+ batch_size = pose_skeleton.shape[0]
+ device = pose_skeleton.device
+
+ rel_rest_pose = rest_pose.clone()
+ # vec_t_k = t_k - t_pa(k)
+ rel_rest_pose[:, 1:] -= rest_pose[:, parents[1:]].clone()
+ rel_rest_pose = torch.unsqueeze(rel_rest_pose, dim=-1)
+
+ # rotate the T pose
+ rotate_rest_pose = torch.zeros_like(rel_rest_pose)
+ # set up the root
+ rotate_rest_pose[:, 0] = rel_rest_pose[:, 0]
+
+ rel_pose_skeleton = torch.unsqueeze(pose_skeleton.clone(), dim=-1).detach()
+ rel_pose_skeleton[:, 1:] -= rel_pose_skeleton[:, parents[1:]].clone()
+ rel_pose_skeleton[:, 0] = rel_rest_pose[:, 0]
+
+ # the predicted final pose
+ final_pose_skeleton = torch.unsqueeze(pose_skeleton.clone(), dim=-1)
+ if train:
+ final_pose_skeleton[:, 1:] -= \
+ final_pose_skeleton[:, parents[1:]].clone()
+ final_pose_skeleton[:, 0] = rel_rest_pose[:, 0]
+ else:
+ final_pose_skeleton += \
+ rel_rest_pose[:, 0:1] - final_pose_skeleton[:, 0:1]
+
+
+ assert phis.dim() == 3
+ phis = phis / (torch.norm(phis, dim=2, keepdim=True) + 1e-8)
+
+ if train:
+ global_orient_mat = batch_get_pelvis_orient(rel_pose_skeleton.clone(),
+ rel_rest_pose.clone(),
+ parents, children, dtype)
+ else:
+ global_orient_mat = batch_get_pelvis_orient_svd(
+ rel_pose_skeleton.clone(), rel_rest_pose.clone(), parents,
+ children, dtype)
+
+ rot_mat_chain = [global_orient_mat]
+ rot_mat_local = [global_orient_mat]
+ # leaf nodes rot_mats
+ if leaf_thetas is not None:
+ leaf_cnt = 0
+ leaf_rot_mats = leaf_thetas.view([batch_size, 5, 3, 3])
+
+ for i in range(1, parents.shape[0]):
+ if children[i] == -1:
+ # leaf nodes
+ if leaf_thetas is not None:
+ rot_mat = leaf_rot_mats[:, leaf_cnt, :, :]
+ leaf_cnt += 1
+
+ rotate_rest_pose[:, i] = rotate_rest_pose[:, parents[
+ i]] + torch.matmul(rot_mat_chain[parents[i]],
+ rel_rest_pose[:, i])
+
+ rot_mat_chain.append(
+ torch.matmul(rot_mat_chain[parents[i]], rot_mat))
+ rot_mat_local.append(rot_mat)
+ elif children[i] == -3:
+ # three children
+ rotate_rest_pose[:, i] = rotate_rest_pose[:, parents[i]] + \
+ torch.matmul(rot_mat_chain[parents[i]], rel_rest_pose[:, i])
+
+ spine_child = []
+ for c in range(1, parents.shape[0]):
+ if parents[c] == i and c not in spine_child:
+ spine_child.append(c)
+
+
+ children_final_loc = []
+ children_rest_loc = []
+ for c in spine_child:
+ temp = final_pose_skeleton[:, c] - rotate_rest_pose[:, i]
+ children_final_loc.append(temp)
+
+ children_rest_loc.append(rel_rest_pose[:, c].clone())
+
+ rot_mat = batch_get_3children_orient_svd(children_final_loc,
+ children_rest_loc,
+ rot_mat_chain[parents[i]],
+ spine_child, dtype)
+
+ rot_mat_chain.append(
+ torch.matmul(rot_mat_chain[parents[i]], rot_mat))
+ rot_mat_local.append(rot_mat)
+ else:
+ # Naive Hybrik
+ if train:
+ # i: the index of k-th joint
+ child_rest_loc = rel_rest_pose[:, i]
+ child_final_loc = final_pose_skeleton[:, i]
+
+ # q_pa(k) = q_pa^2(k) + R_pa(k)(t_pa(k) - t_pa^2(k))
+ rotate_rest_pose[:, i] = rotate_rest_pose[:, parents[i]] + \
+ torch.matmul(rot_mat_chain[parents[i]], rel_rest_pose[:, i])
+ # Adaptive HybrIK
+ if not train:
+ # children[i]: the index of k-th joint
+ child_rest_loc = rel_rest_pose[:, children[i]]
+ child_final_loc = final_pose_skeleton[:, children[
+ i]] - rotate_rest_pose[:, i]
+
+ orig_vec = rel_pose_skeleton[:, children[i]]
+ template_vec = rel_rest_pose[:, children[i]]
+ norm_t = torch.norm(template_vec, dim=1, keepdim=True)
+ orig_vec = orig_vec * norm_t / torch.norm(
+ orig_vec, dim=1, keepdim=True)
+
+ diff = torch.norm(child_final_loc - orig_vec,
+ dim=1,
+ keepdim=True)
+ big_diff_idx = torch.where(diff > 15 / 1000)[0]
+
+ child_final_loc[big_diff_idx] = orig_vec[big_diff_idx]
+
+ # train: vec_p_k = R_pa(k).T * (p_k - p_pa(k))
+ # test: vec_p_k = R_pa(k).T * (p_k - q_pa(k))
+ child_final_loc = torch.matmul(
+ rot_mat_chain[parents[i]].transpose(1, 2), child_final_loc)
+
+ # (B, 1, 1)
+ child_final_norm = torch.norm(child_final_loc, dim=1, keepdim=True)
+ child_rest_norm = torch.norm(child_rest_loc, dim=1, keepdim=True)
+
+ # vec_n
+ axis = torch.cross(child_rest_loc, child_final_loc, dim=1)
+ axis_norm = torch.norm(axis, dim=1, keepdim=True)
+
+ # (B, 1, 1)
+ cos = torch.sum(
+ child_rest_loc * child_final_loc, dim=1,
+ keepdim=True) / (child_rest_norm * child_final_norm + 1e-8)
+ sin = axis_norm / (child_rest_norm * child_final_norm + 1e-8)
+
+ # (B, 3, 1)
+ axis = axis / (axis_norm + 1e-8)
+
+            # Convert the swing rotation (axis-angle) to rot_mat via the Rodrigues formula
+ # (B, 1, 1)
+ rx, ry, rz = torch.split(axis, 1, dim=1)
+ zeros = torch.zeros((batch_size, 1, 1), dtype=dtype, device=device)
+
+ K = torch.cat([zeros, -rz, ry, rz, zeros, -rx, -ry, rx, zeros],
+ dim=1).view((batch_size, 3, 3))
+ ident = torch.eye(3, dtype=dtype, device=device).unsqueeze(dim=0)
+ rot_mat_loc = ident + sin * K + (1 - cos) * torch.bmm(K, K)
+
+ # Convert spin to rot_mat
+ # (B, 3, 1)
+ spin_axis = child_rest_loc / child_rest_norm
+ # (B, 1, 1)
+ rx, ry, rz = torch.split(spin_axis, 1, dim=1)
+ zeros = torch.zeros((batch_size, 1, 1), dtype=dtype, device=device)
+ K = torch.cat([zeros, -rz, ry, rz, zeros, -rx, -ry, rx, zeros],
+ dim=1).view((batch_size, 3, 3))
+ ident = torch.eye(3, dtype=dtype, device=device).unsqueeze(dim=0)
+ # (B, 1, 1)
+ cos, sin = torch.split(phis[:, i - 1], 1, dim=1)
+ cos = torch.unsqueeze(cos, dim=2)
+ sin = torch.unsqueeze(sin, dim=2)
+ rot_mat_spin = ident + sin * K + (1 - cos) * torch.bmm(K, K)
+ rot_mat = torch.matmul(rot_mat_loc, rot_mat_spin)
+
+ rot_mat_chain.append(
+ torch.matmul(rot_mat_chain[parents[i]], rot_mat))
+ rot_mat_local.append(rot_mat)
+
+ # (B, K + 1, 3, 3)
+ rot_mats = torch.stack(rot_mat_local, dim=1)
+
+ return rot_mats, rotate_rest_pose.squeeze(-1)
+
+
+def batch_get_pelvis_orient_svd(rel_pose_skeleton, rel_rest_pose, parents,
+ children, dtype):
+ """Get pelvis orientation svd for batch data.
+
+ Args:
+ rel_pose_skeleton (torch.tensor):
+ Locations of root-normalized pose skeleton with shape (Bx29x3)
+ rel_rest_pose (torch.tensor):
+            Locations of the rest/template pose with shape (Bx29x3)
+ parents (List[int]): list of indexes of kinematic parents with len 29
+ children (List[int]): list of indexes of kinematic children with len 29
+ dtype (torch.dtype, optional):
+ Data type of the created tensors, the default is torch.float32
+
+ Returns:
+ rot_mat (torch.tensor):
+ Rotation matrix of pelvis with shape (Bx3x3)
+ """
+ pelvis_child = [int(children[0])]
+ for i in range(1, parents.shape[0]):
+ if parents[i] == 0 and i not in pelvis_child:
+ pelvis_child.append(i)
+
+ rest_mat = []
+ target_mat = []
+ for child in pelvis_child:
+ rest_mat.append(rel_rest_pose[:, child].clone())
+ target_mat.append(rel_pose_skeleton[:, child].clone())
+
+ rest_mat = torch.cat(rest_mat, dim=2)
+ target_mat = torch.cat(target_mat, dim=2)
+ S = rest_mat.bmm(target_mat.transpose(1, 2))
+
+ mask_zero = S.sum(dim=(1, 2))
+
+ S_non_zero = S[mask_zero != 0].reshape(-1, 3, 3)
+
+ U, _, V = torch.svd(S_non_zero)
+
+ rot_mat = torch.zeros_like(S)
+ rot_mat[mask_zero == 0] = torch.eye(3, device=S.device)
+
+ rot_mat_non_zero = torch.bmm(V, U.transpose(1, 2))
+ rot_mat[mask_zero != 0] = rot_mat_non_zero
+
+ assert torch.sum(torch.isnan(rot_mat)) == 0, ('rot_mat', rot_mat)
+
+ return rot_mat
+
+
+def batch_get_pelvis_orient(rel_pose_skeleton, rel_rest_pose, parents,
+ children, dtype):
+ """Get pelvis orientation for batch data.
+
+ Args:
+ rel_pose_skeleton (torch.tensor):
+ Locations of root-normalized pose skeleton with shape (Bx29x3)
+ rel_rest_pose (torch.tensor):
+            Locations of the rest/template pose with shape (Bx29x3)
+ parents (List[int]): list of indexes of kinematic parents with len 29
+ children (List[int]): list of indexes of kinematic children with len 29
+ dtype (torch.dtype, optional):
+ Data type of the created tensors, the default is torch.float32
+
+ Returns:
+ rot_mat (torch.tensor):
+ Rotation matrix of pelvis with shape (Bx3x3)
+ """
+ batch_size = rel_pose_skeleton.shape[0]
+ device = rel_pose_skeleton.device
+
+ assert children[0] == 3
+ pelvis_child = [int(children[0])]
+ for i in range(1, parents.shape[0]):
+ if parents[i] == 0 and i not in pelvis_child:
+ pelvis_child.append(i)
+
+ spine_final_loc = rel_pose_skeleton[:, int(children[0])].clone()
+ spine_rest_loc = rel_rest_pose[:, int(children[0])].clone()
+ # spine_norm = torch.norm(spine_final_loc, dim=1, keepdim=True)
+ # spine_norm = spine_final_loc / (spine_norm + 1e-8)
+
+ # rot_mat_spine = vectors2rotmat(spine_rest_loc, spine_final_loc, dtype)
+
+ # (B, 1, 1)
+ vec_final_norm = torch.norm(spine_final_loc, dim=1, keepdim=True)
+ vec_rest_norm = torch.norm(spine_rest_loc, dim=1, keepdim=True)
+
+ spine_norm = spine_final_loc / (vec_final_norm + 1e-8)
+
+ # (B, 3, 1)
+ axis = torch.cross(spine_rest_loc, spine_final_loc, dim=1)
+ axis_norm = torch.norm(axis, dim=1, keepdim=True)
+ axis = axis / (axis_norm + 1e-8)
+ angle = torch.arccos(
+ torch.sum(spine_rest_loc * spine_final_loc, dim=1, keepdim=True) /
+ (vec_rest_norm * vec_final_norm + 1e-8))
+ axis_angle = (angle * axis).squeeze()
+ # aa to rotmat
+ rot_mat_spine = aa_to_rotmat(axis_angle)
+
+ assert torch.sum(torch.isnan(rot_mat_spine)) == 0, ('rot_mat_spine',
+ rot_mat_spine)
+ center_final_loc = 0
+ center_rest_loc = 0
+ for child in pelvis_child:
+ if child == int(children[0]):
+ continue
+ center_final_loc = center_final_loc + rel_pose_skeleton[:,
+ child].clone()
+ center_rest_loc = center_rest_loc + rel_rest_pose[:, child].clone()
+ center_final_loc = center_final_loc / (len(pelvis_child) - 1)
+ center_rest_loc = center_rest_loc / (len(pelvis_child) - 1)
+
+ center_rest_loc = torch.matmul(rot_mat_spine, center_rest_loc)
+
+ center_final_loc = center_final_loc - torch.sum(
+ center_final_loc * spine_norm, dim=1, keepdim=True) * spine_norm
+ center_rest_loc = center_rest_loc - torch.sum(
+ center_rest_loc * spine_norm, dim=1, keepdim=True) * spine_norm
+
+ center_final_loc_norm = torch.norm(center_final_loc, dim=1, keepdim=True)
+ center_rest_loc_norm = torch.norm(center_rest_loc, dim=1, keepdim=True)
+
+ # (B, 3, 1)
+ axis = torch.cross(center_rest_loc, center_final_loc, dim=1)
+ axis_norm = torch.norm(axis, dim=1, keepdim=True)
+
+ # (B, 1, 1)
+ cos = torch.sum(
+ center_rest_loc * center_final_loc, dim=1,
+ keepdim=True) / (center_rest_loc_norm * center_final_loc_norm + 1e-8)
+ sin = axis_norm / (center_rest_loc_norm * center_final_loc_norm + 1e-8)
+
+ assert torch.sum(torch.isnan(cos)) == 0, ('cos', cos)
+ assert torch.sum(torch.isnan(sin)) == 0, ('sin', sin)
+ # (B, 3, 1)
+ axis = axis / (axis_norm + 1e-8)
+
+    # Convert the swing rotation (axis-angle) to rot_mat via the Rodrigues formula
+ # (B, 1, 1)
+ rx, ry, rz = torch.split(axis, 1, dim=1)
+ zeros = torch.zeros((batch_size, 1, 1), dtype=dtype, device=device)
+
+ K = torch.cat([zeros, -rz, ry, rz, zeros, -rx, -ry, rx, zeros], dim=1) \
+ .view((batch_size, 3, 3))
+ ident = torch.eye(3, dtype=dtype, device=device).unsqueeze(dim=0)
+ rot_mat_center = ident + sin * K + (1 - cos) * torch.bmm(K, K)
+
+ rot_mat = torch.matmul(rot_mat_center, rot_mat_spine)
+
+ return rot_mat
+
+
+def batch_get_3children_orient_svd(rel_pose_skeleton, rel_rest_pose,
+ rot_mat_chain_parent, children_list, dtype):
+ """Get pelvis orientation for batch data.
+
+ Args:
+ rel_pose_skeleton (torch.tensor):
+ Locations of root-normalized pose skeleton with shape (Bx29x3)
+ rel_rest_pose (torch.tensor):
+            Locations of the rest/template pose with shape (Bx29x3)
+        rot_mat_chain_parent (torch.tensor):
+            Parent's accumulated rotation matrix with shape (Bx3x3)
+        children_list (List[int]): indices of the child joints
+ dtype (torch.dtype, optional):
+ Data type of the created tensors, the default is torch.float32
+
+ Returns:
+ rot_mat (torch.tensor):
+ Child's rotation matrix with shape (Bx3x3)
+ """
+ rest_mat = []
+ target_mat = []
+ for c, child in enumerate(children_list):
+ if isinstance(rel_pose_skeleton, list):
+ target = rel_pose_skeleton[c].clone()
+ template = rel_rest_pose[c].clone()
+ else:
+ target = rel_pose_skeleton[:, child].clone()
+ template = rel_rest_pose[:, child].clone()
+
+ target = torch.matmul(rot_mat_chain_parent.transpose(1, 2), target)
+
+ target_mat.append(target)
+ rest_mat.append(template)
+
+ rest_mat = torch.cat(rest_mat, dim=2)
+ target_mat = torch.cat(target_mat, dim=2)
+ S = rest_mat.bmm(target_mat.transpose(1, 2))
+
+ U, _, V = torch.svd(S)
+
+ rot_mat = torch.bmm(V, U.transpose(1, 2))
+ assert torch.sum(torch.isnan(rot_mat)) == 0, ('3children rot_mat', rot_mat)
+ return rot_mat
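The same Rodrigues construction (skew-symmetric `K` from a unit axis, then `I + sin*K + (1 - cos)*K^2`) appears several times above. The standalone sketch below reproduces that swing-rotation step for random bone vectors and checks that the resulting matrix rotates the normalized rest bone onto the normalized target bone.

```python
# Standalone sketch of the per-joint "swing" rotation used above.
import torch

B, dtype, device = 4, torch.float32, 'cpu'
child_rest_loc = torch.randn(B, 3, 1)    # template bone vector
child_final_loc = torch.randn(B, 3, 1)   # predicted bone vector

rest_norm = torch.norm(child_rest_loc, dim=1, keepdim=True)
final_norm = torch.norm(child_final_loc, dim=1, keepdim=True)

# Rotation axis and angle (via its sine/cosine), as in the code above.
axis = torch.cross(child_rest_loc, child_final_loc, dim=1)
axis_norm = torch.norm(axis, dim=1, keepdim=True)
cos = torch.sum(child_rest_loc * child_final_loc, dim=1,
                keepdim=True) / (rest_norm * final_norm + 1e-8)
sin = axis_norm / (rest_norm * final_norm + 1e-8)
axis = axis / (axis_norm + 1e-8)

# Skew-symmetric matrix of the unit axis, then Rodrigues' formula.
rx, ry, rz = torch.split(axis, 1, dim=1)
zeros = torch.zeros((B, 1, 1), dtype=dtype, device=device)
K = torch.cat([zeros, -rz, ry, rz, zeros, -rx, -ry, rx, zeros],
              dim=1).view((B, 3, 3))
ident = torch.eye(3, dtype=dtype, device=device).unsqueeze(0)
R = ident + sin * K + (1 - cos) * torch.bmm(K, K)

# R should rotate the normalized rest bone onto the normalized target bone.
rotated = torch.matmul(R, child_rest_loc / rest_norm)
assert torch.allclose(rotated, child_final_loc / final_norm, atol=1e-4)
```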
diff --git a/detrsmpl/models/utils/positional_encoding.py b/detrsmpl/models/utils/positional_encoding.py
new file mode 100644
index 0000000000000000000000000000000000000000..c668c5e3564aea1de10f0042a9e458a86fc8e297
--- /dev/null
+++ b/detrsmpl/models/utils/positional_encoding.py
@@ -0,0 +1,159 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import torch
+import torch.nn as nn
+from mmcv.runner import BaseModule
+
+
+class SinePositionalEncoding(BaseModule):
+ """Position encoding with sine and cosine functions.
+
+ See `End-to-End Object Detection with Transformers
+    <https://arxiv.org/abs/2005.12872>`_ for details.
+
+ Args:
+ num_feats (int): The feature dimension for each position
+ along x-axis or y-axis. Note the final returned dimension
+            for each position is 2 times this value.
+ temperature (int, optional): The temperature used for scaling
+ the position embedding. Defaults to 10000.
+ normalize (bool, optional): Whether to normalize the position
+ embedding. Defaults to False.
+ scale (float, optional): A scale factor that scales the position
+ embedding. The scale will be used only when `normalize` is True.
+ Defaults to 2*pi.
+ eps (float, optional): A value added to the denominator for
+ numerical stability. Defaults to 1e-6.
+        offset (float): An offset added to the embedding when the normalization is applied.
+ Defaults to 0.
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ Default: None
+ """
+ def __init__(self,
+ num_feats,
+ temperature=10000,
+ normalize=False,
+ scale=2 * math.pi,
+ eps=1e-6,
+ offset=0.,
+ init_cfg=None):
+ super(SinePositionalEncoding, self).__init__(init_cfg)
+ if normalize:
+ assert isinstance(scale, (float, int)), 'when normalize is set,' \
+ 'scale should be provided and in float or int type, ' \
+ f'found {type(scale)}'
+ self.num_feats = num_feats
+ self.temperature = temperature
+ self.normalize = normalize
+ self.scale = scale
+ self.eps = eps
+ self.offset = offset
+
+ def forward(self, mask):
+ """Forward function for `SinePositionalEncoding`.
+
+ Args:
+            mask (Tensor): ByteTensor mask. Non-zero values represent
+                ignored positions, while zero values mean valid positions
+ for this image. Shape [bs, h, w].
+
+ Returns:
+ pos (Tensor): Returned position embedding with shape
+ [bs, num_feats*2, h, w].
+ """
+ # For convenience of exporting to ONNX, it's required to convert
+ # `masks` from bool to int.
+ mask = mask.to(torch.int)
+ not_mask = 1 - mask # logical_not
+ y_embed = not_mask.cumsum(1, dtype=torch.float32)
+ x_embed = not_mask.cumsum(2, dtype=torch.float32)
+ if self.normalize:
+ y_embed = (y_embed + self.offset) / \
+ (y_embed[:, -1:, :] + self.eps) * self.scale
+ x_embed = (x_embed + self.offset) / \
+ (x_embed[:, :, -1:] + self.eps) * self.scale
+ dim_t = torch.arange(self.num_feats,
+ dtype=torch.float32,
+ device=mask.device)
+ dim_t = self.temperature**(2 * (dim_t // 2) / self.num_feats)
+ pos_x = x_embed[:, :, :, None] / dim_t
+ pos_y = y_embed[:, :, :, None] / dim_t
+ # use `view` instead of `flatten` for dynamically exporting to ONNX
+ B, H, W = mask.size()
+ pos_x = torch.stack(
+ (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()),
+ dim=4).view(B, H, W, -1)
+ pos_y = torch.stack(
+ (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()),
+ dim=4).view(B, H, W, -1)
+ pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+ return pos
+
+ def __repr__(self):
+ """str: a string that describes the module"""
+ repr_str = self.__class__.__name__
+ repr_str += f'(num_feats={self.num_feats}, '
+ repr_str += f'temperature={self.temperature}, '
+ repr_str += f'normalize={self.normalize}, '
+ repr_str += f'scale={self.scale}, '
+ repr_str += f'eps={self.eps})'
+ return repr_str
+
+
+class LearnedPositionalEncoding(BaseModule):
+ """Position embedding with learnable embedding weights.
+
+ Args:
+ num_feats (int): The feature dimension for each position
+ along x-axis or y-axis. The final returned dimension for
+            each position is 2 times this value.
+ row_num_embed (int, optional): The dictionary size of row embeddings.
+ Default 50.
+ col_num_embed (int, optional): The dictionary size of col embeddings.
+ Default 50.
+ init_cfg (dict or list[dict], optional): Initialization config dict.
+ """
+ def __init__(self,
+ num_feats,
+ row_num_embed=50,
+ col_num_embed=50,
+ init_cfg=dict(type='Uniform', layer='Embedding')):
+ super(LearnedPositionalEncoding, self).__init__(init_cfg)
+ self.row_embed = nn.Embedding(row_num_embed, num_feats)
+ self.col_embed = nn.Embedding(col_num_embed, num_feats)
+ self.num_feats = num_feats
+ self.row_num_embed = row_num_embed
+ self.col_num_embed = col_num_embed
+
+ def forward(self, mask):
+ """Forward function for `LearnedPositionalEncoding`.
+
+ Args:
+            mask (Tensor): ByteTensor mask. Non-zero values represent
+                ignored positions, while zero values mean valid positions
+ for this image. Shape [bs, h, w].
+
+ Returns:
+ pos (Tensor): Returned position embedding with shape
+ [bs, num_feats*2, h, w].
+ """
+ h, w = mask.shape[-2:]
+ x = torch.arange(w, device=mask.device)
+ y = torch.arange(h, device=mask.device)
+ x_embed = self.col_embed(x)
+ y_embed = self.row_embed(y)
+ pos = torch.cat(
+ (x_embed.unsqueeze(0).repeat(h, 1, 1), y_embed.unsqueeze(1).repeat(
+ 1, w, 1)),
+ dim=-1).permute(2, 0,
+ 1).unsqueeze(0).repeat(mask.shape[0], 1, 1, 1)
+ return pos
+
+ def __repr__(self):
+ """str: a string that describes the module"""
+ repr_str = self.__class__.__name__
+ repr_str += f'(num_feats={self.num_feats}, '
+ repr_str += f'row_num_embed={self.row_num_embed}, '
+ repr_str += f'col_num_embed={self.col_num_embed})'
+ return repr_str
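For a quick shape check of `SinePositionalEncoding`, the following sketch instantiates it directly; it requires mmcv, since the class derives from `mmcv.runner.BaseModule`, and the module path matches the file added in this diff.

```python
# Quick shape check for the sine positional encoding above.
import torch

from detrsmpl.models.utils.positional_encoding import SinePositionalEncoding

pos_enc = SinePositionalEncoding(num_feats=128, normalize=True)
# All-zero mask: every position is valid.
mask = torch.zeros(1, 32, 40, dtype=torch.bool)
pos = pos_enc(mask)
print(pos.shape)  # torch.Size([1, 256, 32, 40]) -> [bs, 2 * num_feats, h, w]
```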
diff --git a/detrsmpl/models/utils/res_layer.py b/detrsmpl/models/utils/res_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..40cd79c13d9808097daa4934c3b9763565e4628b
--- /dev/null
+++ b/detrsmpl/models/utils/res_layer.py
@@ -0,0 +1,187 @@
+from mmcv.cnn import build_conv_layer, build_norm_layer
+from mmcv.runner import BaseModule, Sequential
+from torch import nn as nn
+
+
+class ResLayer(Sequential):
+ """ResLayer to build ResNet style backbone.
+
+ Args:
+ block (nn.Module): block used to build ResLayer.
+ inplanes (int): inplanes of block.
+ planes (int): planes of block.
+ num_blocks (int): number of blocks.
+ stride (int): stride of the first block. Default: 1
+ avg_down (bool): Use AvgPool instead of stride conv when
+ downsampling in the bottleneck. Default: False
+ conv_cfg (dict): dictionary to construct and config conv layer.
+ Default: None
+ norm_cfg (dict): dictionary to construct and config norm layer.
+ Default: dict(type='BN')
+ downsample_first (bool): Downsample at the first block or last block.
+ False for Hourglass, True for ResNet. Default: True
+ """
+ def __init__(self,
+ block,
+ inplanes,
+ planes,
+ num_blocks,
+ stride=1,
+ avg_down=False,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN'),
+ downsample_first=True,
+ **kwargs):
+ self.block = block
+
+ downsample = None
+ if stride != 1 or inplanes != planes * block.expansion:
+ downsample = []
+ conv_stride = stride
+ if avg_down:
+ conv_stride = 1
+ downsample.append(
+ nn.AvgPool2d(kernel_size=stride,
+ stride=stride,
+ ceil_mode=True,
+ count_include_pad=False))
+ downsample.extend([
+ build_conv_layer(conv_cfg,
+ inplanes,
+ planes * block.expansion,
+ kernel_size=1,
+ stride=conv_stride,
+ bias=False),
+ build_norm_layer(norm_cfg, planes * block.expansion)[1]
+ ])
+ downsample = nn.Sequential(*downsample)
+
+ layers = []
+ if downsample_first:
+ layers.append(
+ block(inplanes=inplanes,
+ planes=planes,
+ stride=stride,
+ downsample=downsample,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ **kwargs))
+ inplanes = planes * block.expansion
+ for _ in range(1, num_blocks):
+ layers.append(
+ block(inplanes=inplanes,
+ planes=planes,
+ stride=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ **kwargs))
+
+ else: # downsample_first=False is for HourglassModule
+ for _ in range(num_blocks - 1):
+ layers.append(
+ block(inplanes=inplanes,
+ planes=inplanes,
+ stride=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ **kwargs))
+ layers.append(
+ block(inplanes=inplanes,
+ planes=planes,
+ stride=stride,
+ downsample=downsample,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ **kwargs))
+ super(ResLayer, self).__init__(*layers)
+
+
+class SimplifiedBasicBlock(BaseModule):
+ """Simplified version of original basic residual block. This is used in
+    `SCNet <https://arxiv.org/abs/2012.10150>`_.
+
+ - Norm layer is now optional
+ - Last ReLU in forward function is removed
+ """
+ expansion = 1
+
+ def __init__(self,
+ inplanes,
+ planes,
+ stride=1,
+ dilation=1,
+ downsample=None,
+ style='pytorch',
+ with_cp=False,
+ conv_cfg=None,
+ norm_cfg=dict(type='BN'),
+ dcn=None,
+ plugins=None,
+                 init_cfg=None):
+        super(SimplifiedBasicBlock, self).__init__(init_cfg)
+ assert dcn is None, 'Not implemented yet.'
+ assert plugins is None, 'Not implemented yet.'
+ assert not with_cp, 'Not implemented yet.'
+ self.with_norm = norm_cfg is not None
+ with_bias = True if norm_cfg is None else False
+ self.conv1 = build_conv_layer(conv_cfg,
+ inplanes,
+ planes,
+ 3,
+ stride=stride,
+ padding=dilation,
+ dilation=dilation,
+ bias=with_bias)
+ if self.with_norm:
+ self.norm1_name, norm1 = build_norm_layer(norm_cfg,
+ planes,
+ postfix=1)
+ self.add_module(self.norm1_name, norm1)
+ self.conv2 = build_conv_layer(conv_cfg,
+ planes,
+ planes,
+ 3,
+ padding=1,
+ bias=with_bias)
+ if self.with_norm:
+ self.norm2_name, norm2 = build_norm_layer(norm_cfg,
+ planes,
+ postfix=2)
+ self.add_module(self.norm2_name, norm2)
+
+ self.relu = nn.ReLU(inplace=True)
+ self.downsample = downsample
+ self.stride = stride
+ self.dilation = dilation
+ self.with_cp = with_cp
+
+ @property
+ def norm1(self):
+ """nn.Module: normalization layer after the first convolution layer"""
+ return getattr(self, self.norm1_name) if self.with_norm else None
+
+ @property
+ def norm2(self):
+ """nn.Module: normalization layer after the second convolution layer"""
+ return getattr(self, self.norm2_name) if self.with_norm else None
+
+ def forward(self, x):
+ """Forward function."""
+
+ identity = x
+
+ out = self.conv1(x)
+ if self.with_norm:
+ out = self.norm1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ if self.with_norm:
+ out = self.norm2(out)
+
+ if self.downsample is not None:
+ identity = self.downsample(x)
+
+ out += identity
+
+ return out
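To see the two classes above working together, here is a minimal sketch that builds a small residual stage from `SimplifiedBasicBlock` (expansion = 1); like the rest of this module it needs mmcv installed.

```python
# Minimal sketch: a small residual stage built from SimplifiedBasicBlock.
import torch

from detrsmpl.models.utils.res_layer import ResLayer, SimplifiedBasicBlock

# inplanes == planes * expansion and stride == 1, so no downsample branch.
stage = ResLayer(SimplifiedBasicBlock, inplanes=64, planes=64, num_blocks=2)
x = torch.randn(1, 64, 56, 56)
print(stage(x).shape)  # torch.Size([1, 64, 56, 56])
```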
diff --git a/detrsmpl/models/utils/transformer.py b/detrsmpl/models/utils/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1637938e74bba51a2e23ec0964bf0e5df30851ed
--- /dev/null
+++ b/detrsmpl/models/utils/transformer.py
@@ -0,0 +1,717 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+import warnings
+
+import torch
+import torch.nn as nn
+from mmcv.cnn.bricks.registry import (
+ TRANSFORMER_LAYER,
+ TRANSFORMER_LAYER_SEQUENCE,
+)
+from mmcv.cnn.bricks.transformer import (
+ BaseTransformerLayer,
+ TransformerLayerSequence,
+ build_transformer_layer_sequence,
+)
+from mmcv.runner.base_module import BaseModule
+# from mmcv.utils import to_2tuple
+from torch.nn.init import normal_
+
+# from mmdet.models.utils.builder import TRANSFORMER
+from .builder import TRANSFORMER
+
+# import torch.nn.functional as F
+from mmcv.cnn import ( # build_activation_layer,; build_conv_layer,
+ build_norm_layer, xavier_init,
+)
+
+# from typing import Sequence
+
+try:
+ from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention
+
+except ImportError:
+ warnings.warn(
+ '`MultiScaleDeformableAttention` in MMCV has been moved to '
+ '`mmcv.ops.multi_scale_deform_attn`, please update your MMCV')
+ from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention
+
+
+def inverse_sigmoid(x, eps=1e-5):
+ """Inverse function of sigmoid.
+
+ Args:
+ x (Tensor): The tensor to do the
+ inverse.
+ eps (float): EPS avoid numerical
+ overflow. Defaults 1e-5.
+ Returns:
+ Tensor: The x has passed the inverse
+ function of sigmoid, has same
+ shape with input.
+ """
+ x = x.clamp(min=0, max=1)
+ x1 = x.clamp(min=eps)
+ x2 = (1 - x).clamp(min=eps)
+ return torch.log(x1 / x2)
+
+
+@TRANSFORMER_LAYER.register_module()
+class DetrTransformerDecoderLayer(BaseTransformerLayer):
+ """Implements decoder layer in DETR transformer.
+
+ Args:
+        attn_cfgs (list[`mmcv.ConfigDict`] | list[dict] | dict):
+            Configs for self_attention or cross_attention, the order
+            should be consistent with it in `operation_order`. If it is
+            a dict, it would be expanded to the number of attention
+            modules in `operation_order`.
+        feedforward_channels (int): The hidden dimension for FFNs.
+        ffn_dropout (float): Probability of an element to be zeroed
+            in ffn. Default: 0.0.
+        operation_order (tuple[str]): The execution order of operations
+            in the transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
+            Default: None.
+        act_cfg (dict): The activation config for FFNs.
+            Default: dict(type='ReLU', inplace=True).
+        norm_cfg (dict): Config dict for the normalization layer.
+            Default: `LN`.
+        ffn_num_fcs (int): The number of fully-connected layers in FFNs.
+            Default: 2.
+ """
+ def __init__(self,
+ attn_cfgs,
+ feedforward_channels,
+ ffn_dropout=0.0,
+ operation_order=None,
+ act_cfg=dict(type='ReLU', inplace=True),
+ norm_cfg=dict(type='LN'),
+ ffn_num_fcs=2,
+ **kwargs):
+ super(DetrTransformerDecoderLayer,
+ self).__init__(attn_cfgs=attn_cfgs,
+ feedforward_channels=feedforward_channels,
+ ffn_dropout=ffn_dropout,
+ operation_order=operation_order,
+ act_cfg=act_cfg,
+ norm_cfg=norm_cfg,
+ ffn_num_fcs=ffn_num_fcs,
+ **kwargs)
+ assert len(operation_order) == 6
+ assert set(operation_order) == set(
+ ['self_attn', 'norm', 'cross_attn', 'ffn'])
+
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class DetrTransformerEncoder(TransformerLayerSequence):
+ """TransformerEncoder of DETR.
+
+ Args:
+ post_norm_cfg (dict): Config of last normalization layer. Default:
+ `LN`. Only used when `self.pre_norm` is `True`
+ """
+ def __init__(self, *args, post_norm_cfg=dict(type='LN'), **kwargs):
+ super(DetrTransformerEncoder, self).__init__(*args, **kwargs)
+ if post_norm_cfg is not None:
+ self.post_norm = build_norm_layer(
+ post_norm_cfg, self.embed_dims)[1] if self.pre_norm else None
+ else:
+ assert not self.pre_norm, f'Use prenorm in ' \
+ f'{self.__class__.__name__},' \
+ f'Please specify post_norm_cfg'
+ self.post_norm = None
+
+ def forward(self, *args, **kwargs):
+ """Forward function for `TransformerCoder`.
+
+ Returns:
+ Tensor: forwarded results with shape [num_query, bs, embed_dims].
+ """
+ x = super(DetrTransformerEncoder, self).forward(*args, **kwargs)
+ if self.post_norm is not None:
+ x = self.post_norm(x)
+ return x
+
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class DetrTransformerDecoder(TransformerLayerSequence):
+ """Implements the decoder in DETR transformer.
+
+ Args:
+ return_intermediate (bool): Whether to return intermediate outputs.
+ post_norm_cfg (dict): Config of last normalization layer. Default:
+ `LN`.
+ """
+ def __init__(self,
+ *args,
+ post_norm_cfg=dict(type='LN'),
+ return_intermediate=False,
+ **kwargs):
+
+ super(DetrTransformerDecoder, self).__init__(*args, **kwargs)
+ self.return_intermediate = return_intermediate
+ if post_norm_cfg is not None:
+ self.post_norm = build_norm_layer(post_norm_cfg,
+ self.embed_dims)[1]
+ else:
+ self.post_norm = None
+
+ def forward(self, query, *args, **kwargs):
+ """Forward function for `TransformerDecoder`.
+
+ Args:
+ query (Tensor): Input query with shape
+ `(num_query, bs, embed_dims)`.
+
+ Returns:
+ Tensor: Results with shape [1, num_query, bs, embed_dims] when
+ return_intermediate is `False`, otherwise it has shape
+ [num_layers, num_query, bs, embed_dims].
+ """
+ if not self.return_intermediate:
+ x = super().forward(query, *args, **kwargs)
+ if self.post_norm:
+ x = self.post_norm(x)[None]
+ return x
+
+ intermediate = []
+ for layer in self.layers:
+ query = layer(query, *args, **kwargs)
+ if self.return_intermediate:
+ if self.post_norm is not None:
+ intermediate.append(self.post_norm(query))
+ else:
+ intermediate.append(query)
+ return torch.stack(intermediate)
+
+
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class DeformableDetrTransformerDecoder(TransformerLayerSequence):
+ """Implements the decoder in DETR transformer.
+
+ Args:
+ return_intermediate (bool): Whether to return intermediate outputs.
+ coder_norm_cfg (dict): Config of last normalization layer. Default:
+ `LN`.
+ """
+ def __init__(self, *args, return_intermediate=False, **kwargs):
+
+ super(DeformableDetrTransformerDecoder, self).__init__(*args, **kwargs)
+ self.return_intermediate = return_intermediate
+
+ def forward(self,
+ query,
+ *args,
+ reference_points=None,
+ valid_ratios=None,
+ reg_branches=None,
+ **kwargs):
+ """Forward function for `TransformerDecoder`.
+
+ Args:
+ query (Tensor): Input query with shape
+ `(num_query, bs, embed_dims)`.
+ reference_points (Tensor): The reference
+                points of offset, has shape
+                (bs, num_query, 4) when as_two_stage,
+                otherwise has shape (bs, num_query, 2).
+            valid_ratios (Tensor): The ratios of valid
+ points on the feature map, has shape
+ (bs, num_levels, 2)
+            reg_branches (obj:`nn.ModuleList`): Used for
+                refining the regression results. Only
+                passed when with_box_refine is True,
+                otherwise `None` is passed.
+
+ Returns:
+ Tensor: Results with shape [1, num_query, bs, embed_dims] when
+ return_intermediate is `False`, otherwise it has shape
+ [num_layers, num_query, bs, embed_dims].
+ """
+ output = query
+ intermediate = []
+ intermediate_reference_points = []
+ for lid, layer in enumerate(self.layers):
+ if reference_points.shape[-1] == 4:
+ reference_points_input = reference_points[:, :, None] * \
+ torch.cat([valid_ratios, valid_ratios], -1)[:, None]
+ else:
+ assert reference_points.shape[-1] == 2
+ reference_points_input = reference_points[:, :, None] * \
+ valid_ratios[:, None]
+ output = layer(output,
+ *args,
+ reference_points=reference_points_input,
+ **kwargs)
+ output = output.permute(1, 0, 2)
+
+ if reg_branches is not None:
+ tmp = reg_branches[lid](output)
+ if reference_points.shape[-1] == 4:
+ new_reference_points = tmp + inverse_sigmoid(
+ reference_points)
+ new_reference_points = new_reference_points.sigmoid()
+ else:
+ assert reference_points.shape[-1] == 2
+ new_reference_points = tmp
+ new_reference_points[..., :2] = tmp[
+ ..., :2] + inverse_sigmoid(reference_points)
+ new_reference_points = new_reference_points.sigmoid()
+ reference_points = new_reference_points.detach()
+
+ output = output.permute(1, 0, 2)
+ if self.return_intermediate:
+ intermediate.append(output)
+ intermediate_reference_points.append(reference_points)
+
+ if self.return_intermediate:
+ return torch.stack(intermediate), torch.stack(
+ intermediate_reference_points)
+
+ return output, reference_points
+
+
+@TRANSFORMER.register_module()
+class Transformer(BaseModule):
+ """Implements the DETR transformer.
+
+    Following the official DETR implementation, this module is copy-pasted
+ from torch.nn.Transformer with modifications:
+
+ * positional encodings are passed in MultiheadAttention
+ * extra LN at the end of encoder is removed
+ * decoder returns a stack of activations from all decoding layers
+
+ See `paper: End-to-End Object Detection with Transformers
+    <https://arxiv.org/abs/2005.12872>`_ for details.
+
+ Args:
+ encoder (`mmcv.ConfigDict` | Dict): Config of
+ TransformerEncoder. Defaults to None.
+ decoder ((`mmcv.ConfigDict` | Dict)): Config of
+ TransformerDecoder. Defaults to None
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+ Defaults to None.
+ """
+ def __init__(self, encoder=None, decoder=None, init_cfg=None):
+ super(Transformer, self).__init__(init_cfg=init_cfg)
+ self.encoder = build_transformer_layer_sequence(encoder)
+ self.decoder = build_transformer_layer_sequence(decoder)
+ self.embed_dims = self.encoder.embed_dims
+
+ def init_weights(self):
+ # follow the official DETR to init parameters
+ for m in self.modules():
+ if hasattr(m, 'weight') and m.weight.dim() > 1:
+ xavier_init(m, distribution='uniform')
+ self._is_init = True
+
+ def forward(self, x, mask, query_embed, pos_embed):
+ """Forward function for `Transformer`.
+
+ Args:
+ x (Tensor): Input query with shape [bs, c, h, w] where
+ c = embed_dims.
+ mask (Tensor): The key_padding_mask used for encoder and decoder,
+ with shape [bs, h, w].
+ query_embed (Tensor): The query embedding for decoder, with shape
+ [num_query, c].
+ pos_embed (Tensor): The positional encoding for encoder and
+ decoder, with the same shape as `x`.
+
+ Returns:
+ tuple[Tensor]: results of decoder containing the following tensor.
+
+ - out_dec: Output from decoder. If return_intermediate_dec \
+ is True output has shape [num_dec_layers, bs,
+ num_query, embed_dims], else has shape [1, bs, \
+ num_query, embed_dims].
+ - memory: Output results from encoder, with shape \
+ [bs, embed_dims, h, w].
+ """
+ bs, c, h, w = x.shape
+ # use `view` instead of `flatten` for dynamically exporting to ONNX
+ x = x.view(bs, c, -1).permute(2, 0, 1) # [bs, c, h, w] -> [h*w, bs, c]
+ pos_embed = pos_embed.view(bs, c, -1).permute(2, 0, 1)
+ query_embed = query_embed.unsqueeze(1).repeat(
+ 1, bs, 1) # [num_query, dim] -> [num_query, bs, dim]
+ mask = mask.view(bs, -1) # [bs, h, w] -> [bs, h*w]
+ memory = self.encoder(query=x,
+ key=None,
+ value=None,
+ query_pos=pos_embed,
+ query_key_padding_mask=mask)
+ target = torch.zeros_like(query_embed)
+ # out_dec: [num_layers, num_query, bs, dim]
+ out_dec = self.decoder(query=target,
+ key=memory,
+ value=memory,
+ key_pos=pos_embed,
+ query_pos=query_embed,
+ key_padding_mask=mask)
+ out_dec = out_dec.transpose(1, 2)
+ memory = memory.permute(1, 2, 0).reshape(bs, c, h, w)
+ return out_dec, memory
+
+
+@TRANSFORMER.register_module()
+class DeformableDetrTransformer(Transformer):
+ """Implements the DeformableDETR transformer.
+
+ Args:
+ as_two_stage (bool): Generate query from encoder features.
+ Default: False.
+        num_feature_levels (int): Number of feature maps from FPN.
+ Default: 4.
+ two_stage_num_proposals (int): Number of proposals when set
+ `as_two_stage` as True. Default: 300.
+ """
+ def __init__(self,
+ as_two_stage=False,
+ num_feature_levels=4,
+ two_stage_num_proposals=300,
+ **kwargs):
+ super(DeformableDetrTransformer, self).__init__(**kwargs)
+ self.as_two_stage = as_two_stage
+ self.num_feature_levels = num_feature_levels
+ self.two_stage_num_proposals = two_stage_num_proposals
+ self.embed_dims = self.encoder.embed_dims
+ self.init_layers()
+
+ def init_layers(self):
+ """Initialize layers of the DeformableDetrTransformer."""
+ self.level_embeds = nn.Parameter(
+ torch.Tensor(self.num_feature_levels, self.embed_dims))
+
+ if self.as_two_stage:
+ self.enc_output = nn.Linear(self.embed_dims, self.embed_dims)
+ self.enc_output_norm = nn.LayerNorm(self.embed_dims)
+ self.pos_trans = nn.Linear(self.embed_dims * 2,
+ self.embed_dims * 2)
+ self.pos_trans_norm = nn.LayerNorm(self.embed_dims * 2)
+ else:
+ self.reference_points = nn.Linear(self.embed_dims, 2)
+
+ def init_weights(self):
+ """Initialize the transformer weights."""
+ for p in self.parameters():
+ if p.dim() > 1:
+ nn.init.xavier_uniform_(p)
+ for m in self.modules():
+ if isinstance(m, MultiScaleDeformableAttention):
+ m.init_weights()
+ if not self.as_two_stage:
+ xavier_init(self.reference_points, distribution='uniform', bias=0.)
+ normal_(self.level_embeds)
+
+ def gen_encoder_output_proposals(self, memory, memory_padding_mask,
+ spatial_shapes):
+ """Generate proposals from encoded memory.
+
+ Args:
+ memory (Tensor) : The output of encoder,
+            has shape (bs, num_key, embed_dim). num_key is
+            equal to the number of points on the feature maps from
+            all levels.
+ memory_padding_mask (Tensor): Padding mask for memory.
+ has shape (bs, num_key).
+ spatial_shapes (Tensor): The shape of all feature maps.
+ has shape (num_level, 2).
+
+ Returns:
+ tuple: A tuple of feature map and bbox prediction.
+
+ - output_memory (Tensor): The input of decoder, \
+ has shape (bs, num_key, embed_dim). num_key is \
+                equal to the number of points on the feature maps from \
+ all levels.
+ - output_proposals (Tensor): The normalized proposal \
+                after an inverse sigmoid, has shape \
+ (bs, num_keys, 4).
+ """
+
+ N, S, C = memory.shape
+ proposals = []
+ _cur = 0
+ for lvl, (H, W) in enumerate(spatial_shapes):
+ mask_flatten_ = memory_padding_mask[:, _cur:(_cur + H * W)].view(
+ N, H, W, 1)
+ valid_H = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
+ valid_W = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
+
+ grid_y, grid_x = torch.meshgrid(
+ torch.linspace(0,
+ H - 1,
+ H,
+ dtype=torch.float32,
+ device=memory.device),
+ torch.linspace(0,
+ W - 1,
+ W,
+ dtype=torch.float32,
+ device=memory.device))
+ grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
+
+ scale = torch.cat([valid_W.unsqueeze(-1),
+ valid_H.unsqueeze(-1)], 1).view(N, 1, 1, 2)
+ grid = (grid.unsqueeze(0).expand(N, -1, -1, -1) + 0.5) / scale
+ wh = torch.ones_like(grid) * 0.05 * (2.0**lvl)
+ proposal = torch.cat((grid, wh), -1).view(N, -1, 4)
+ proposals.append(proposal)
+ _cur += (H * W)
+ output_proposals = torch.cat(proposals, 1)
+ output_proposals_valid = ((output_proposals > 0.01) &
+ (output_proposals < 0.99)).all(-1,
+ keepdim=True)
+ output_proposals = torch.log(output_proposals / (1 - output_proposals))
+ output_proposals = output_proposals.masked_fill(
+ memory_padding_mask.unsqueeze(-1), float('inf'))
+ output_proposals = output_proposals.masked_fill(
+ ~output_proposals_valid, float('inf'))
+
+ output_memory = memory
+ output_memory = output_memory.masked_fill(
+ memory_padding_mask.unsqueeze(-1), float(0))
+ output_memory = output_memory.masked_fill(~output_proposals_valid,
+ float(0))
+ output_memory = self.enc_output_norm(self.enc_output(output_memory))
+ return output_memory, output_proposals
+
+ @staticmethod
+ def get_reference_points(spatial_shapes, valid_ratios, device):
+ """Get the reference points used in decoder.
+
+ Args:
+ spatial_shapes (Tensor): The shape of all
+ feature maps, has shape (num_level, 2).
+            valid_ratios (Tensor): The ratios of valid
+ points on the feature map, has shape
+ (bs, num_levels, 2)
+ device (obj:`device`): The device where
+ reference_points should be.
+
+ Returns:
+ Tensor: reference points used in decoder, has \
+ shape (bs, num_keys, num_levels, 2).
+ """
+ reference_points_list = []
+ for lvl, (H, W) in enumerate(spatial_shapes):
+ # TODO check this 0.5
+ ref_y, ref_x = torch.meshgrid(
+ torch.linspace(0.5,
+ H - 0.5,
+ H,
+ dtype=torch.float32,
+ device=device),
+ torch.linspace(0.5,
+ W - 0.5,
+ W,
+ dtype=torch.float32,
+ device=device))
+ ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, lvl, 1] *
+ H)
+ ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, lvl, 0] *
+ W)
+ ref = torch.stack((ref_x, ref_y), -1)
+ reference_points_list.append(ref)
+ reference_points = torch.cat(reference_points_list, 1)
+ reference_points = reference_points[:, :, None] * valid_ratios[:, None]
+ return reference_points
+
+ def get_valid_ratio(self, mask):
+ """Get the valid radios of feature maps of all level."""
+ _, H, W = mask.shape
+ valid_H = torch.sum(~mask[:, :, 0], 1)
+ valid_W = torch.sum(~mask[:, 0, :], 1)
+ valid_ratio_h = valid_H.float() / H
+ valid_ratio_w = valid_W.float() / W
+ valid_ratio = torch.stack([valid_ratio_w, valid_ratio_h], -1)
+ return valid_ratio
+
+ def get_proposal_pos_embed(self,
+ proposals,
+ num_pos_feats=128,
+ temperature=10000):
+ """Get the position embedding of proposal."""
+ scale = 2 * math.pi
+ dim_t = torch.arange(num_pos_feats,
+ dtype=torch.float32,
+ device=proposals.device)
+ dim_t = temperature**(2 * (dim_t // 2) / num_pos_feats)
+ # N, L, 4
+ proposals = proposals.sigmoid() * scale
+ # N, L, 4, 128
+ pos = proposals[:, :, :, None] / dim_t
+ # N, L, 4, 64, 2
+ pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()),
+ dim=4).flatten(2)
+ return pos
+
+ def forward(self,
+ mlvl_feats,
+ mlvl_masks,
+ query_embed,
+ mlvl_pos_embeds,
+ reg_branches=None,
+ cls_branches=None,
+ smpl_branches=None,
+ **kwargs):
+ """Forward function for `Transformer`.
+
+ Args:
+ mlvl_feats (list(Tensor)): Input queries from
+ different level. Each element has shape
+ [bs, embed_dims, h, w].
+ mlvl_masks (list(Tensor)): The key_padding_mask from
+ different level used for encoder and decoder,
+ each element has shape [bs, h, w].
+ query_embed (Tensor): The query embedding for decoder,
+ with shape [num_query, c].
+ mlvl_pos_embeds (list(Tensor)): The positional encoding
+ of feats from different level, has the shape
+ [bs, embed_dims, h, w].
+ reg_branches (obj:`nn.ModuleList`): Regression heads for
+ feature maps from each decoder layer. Only would
+ be passed when
+ `with_box_refine` is True. Default to None.
+ cls_branches (obj:`nn.ModuleList`): Classification heads
+ for feature maps from each decoder layer. Only would
+ be passed when `as_two_stage`
+ is True. Default to None.
+
+
+ Returns:
+ tuple[Tensor]: results of decoder containing the following tensor.
+
+ - inter_states: Outputs from decoder. If
+ return_intermediate_dec is True output has shape \
+ (num_dec_layers, bs, num_query, embed_dims), else has \
+ shape (1, bs, num_query, embed_dims).
+ - init_reference_out: The initial value of reference \
+ points, has shape (bs, num_queries, 4).
+ - inter_references_out: The internal value of reference \
+ points in decoder, has shape \
+ (num_dec_layers, bs,num_query, embed_dims)
+ - enc_outputs_class: The classification score of \
+ proposals generated from \
+ encoder's feature maps, has shape \
+ (batch, h*w, num_classes). \
+ Only would be returned when `as_two_stage` is True, \
+ otherwise None.
+ - enc_outputs_coord_unact: The regression results \
+ generated from encoder's feature maps., has shape \
+ (batch, h*w, 4). Only would \
+ be returned when `as_two_stage` is True, \
+ otherwise None.
+ """
+ assert self.as_two_stage or query_embed is not None
+
+ feat_flatten = []
+ mask_flatten = []
+ lvl_pos_embed_flatten = []
+ spatial_shapes = []
+ for lvl, (feat, mask, pos_embed) in enumerate(
+ zip(mlvl_feats, mlvl_masks, mlvl_pos_embeds)):
+ bs, c, h, w = feat.shape
+ spatial_shape = (h, w)
+ spatial_shapes.append(spatial_shape)
+ feat = feat.flatten(2).transpose(1, 2)
+ mask = mask.flatten(1)
+ pos_embed = pos_embed.flatten(2).transpose(1, 2)
+ lvl_pos_embed = pos_embed + self.level_embeds[lvl].view(1, 1, -1)
+ lvl_pos_embed_flatten.append(lvl_pos_embed)
+ feat_flatten.append(feat)
+ mask_flatten.append(mask)
+ feat_flatten = torch.cat(feat_flatten, 1)
+ mask_flatten = torch.cat(mask_flatten, 1)
+ lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
+ spatial_shapes = torch.as_tensor(spatial_shapes,
+ dtype=torch.long,
+ device=feat_flatten.device)
+ level_start_index = torch.cat((spatial_shapes.new_zeros(
+ (1, )), spatial_shapes.prod(1).cumsum(0)[:-1]))
+ valid_ratios = torch.stack(
+ [self.get_valid_ratio(m) for m in mlvl_masks], 1)
+
+ reference_points = \
+ self.get_reference_points(spatial_shapes,
+ valid_ratios,
+ device=feat.device)
+
+ feat_flatten = feat_flatten.permute(1, 0, 2) # (H*W, bs, embed_dims)
+ lvl_pos_embed_flatten = lvl_pos_embed_flatten.permute(
+ 1, 0, 2) # (H*W, bs, embed_dims)
+ memory = self.encoder(query=feat_flatten,
+ key=None,
+ value=None,
+ query_pos=lvl_pos_embed_flatten,
+ query_key_padding_mask=mask_flatten,
+ spatial_shapes=spatial_shapes,
+ reference_points=reference_points,
+ level_start_index=level_start_index,
+ valid_ratios=valid_ratios,
+ **kwargs)
+
+ memory = memory.permute(1, 0, 2)
+ bs, _, c = memory.shape
+ if self.as_two_stage:
+ output_memory, output_proposals = \
+ self.gen_encoder_output_proposals(
+ memory, mask_flatten, spatial_shapes)
+ enc_outputs_class = cls_branches[self.decoder.num_layers](
+ output_memory)
+ enc_outputs_coord_unact = \
+ reg_branches[
+ self.decoder.num_layers](output_memory) + output_proposals
+
+ topk = self.two_stage_num_proposals
+ # We only use the first channel in enc_outputs_class as foreground,
+ # the other (num_classes - 1) channels are actually not used.
+ # Its targets are set to be 0s, which indicates the first
+ # class (foreground) because we use [0, num_classes - 1] to
+ # indicate class labels, background class is indicated by
+ # num_classes (similar convention in RPN).
+ # See https://github.com/open-mmlab/mmdetection/blob/master/mmdet/models/dense_heads/deformable_detr_head.py#L241 # noqa
+ # This follows the official implementation of Deformable DETR.
+ topk_proposals = torch.topk(enc_outputs_class[..., 0], topk,
+ dim=1)[1]
+ topk_coords_unact = torch.gather(
+ enc_outputs_coord_unact, 1,
+ topk_proposals.unsqueeze(-1).repeat(1, 1, 4))
+ topk_coords_unact = topk_coords_unact.detach()
+ reference_points = topk_coords_unact.sigmoid()
+ init_reference_out = reference_points
+ pos_trans_out = self.pos_trans_norm(
+ self.pos_trans(self.get_proposal_pos_embed(topk_coords_unact)))
+ query_pos, query = torch.split(pos_trans_out, c, dim=2)
+ else:
+ query_pos, query = torch.split(query_embed, c, dim=1)
+ query_pos = query_pos.unsqueeze(0).expand(bs, -1, -1)
+ query = query.unsqueeze(0).expand(bs, -1, -1)
+ reference_points = self.reference_points(query_pos).sigmoid()
+ init_reference_out = reference_points
+
+ # decoder
+ query = query.permute(1, 0, 2)
+ memory = memory.permute(1, 0, 2)
+ query_pos = query_pos.permute(1, 0, 2)
+ inter_states, inter_references = self.decoder(
+ query=query,
+ key=None,
+ value=memory,
+ query_pos=query_pos,
+ key_padding_mask=mask_flatten,
+ reference_points=reference_points,
+ spatial_shapes=spatial_shapes,
+ level_start_index=level_start_index,
+ valid_ratios=valid_ratios,
+ reg_branches=reg_branches,
+ smpl_branches=smpl_branches,
+ **kwargs)
+
+ inter_references_out = inter_references
+ if self.as_two_stage:
+ return inter_states, init_reference_out,\
+ inter_references_out, enc_outputs_class,\
+ enc_outputs_coord_unact
+ return inter_states, init_reference_out, \
+ inter_references_out, None, None
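Of the helpers in this file, `inverse_sigmoid` is the easiest to check in isolation: it is the logit function with clamping, used above to refine reference points in unconstrained space. A small numerical sketch (importing the module requires mmcv):

```python
# Numerical sanity check for inverse_sigmoid above.
import torch

from detrsmpl.models.utils.transformer import inverse_sigmoid

x = torch.tensor([0.0, 1e-7, 0.25, 0.5, 0.75, 1.0])
y = inverse_sigmoid(x, eps=1e-5)
print(y)
# Values well inside (eps, 1 - eps) round-trip through sigmoid;
# 0 and 1 are clamped to finite logits of about +-11.5 instead of +-inf.
assert torch.allclose(torch.sigmoid(y[2:5]), x[2:5], atol=1e-6)
```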
diff --git a/detrsmpl/utils/__init__.py b/detrsmpl/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/detrsmpl/utils/camera_utils.py b/detrsmpl/utils/camera_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7ef6e252297c7503b213a30f7dbc76237551d8c
--- /dev/null
+++ b/detrsmpl/utils/camera_utils.py
@@ -0,0 +1,210 @@
+import copy
+import os
+from typing import Iterable, Optional, Union
+
+import numpy as np
+import torch
+from pytorch3d.renderer.cameras import CamerasBase
+
+from detrsmpl.core.cameras import build_cameras
+from detrsmpl.core.conventions.cameras.convert_convention import (
+ convert_camera_matrix,
+ convert_world_view,
+)
+from detrsmpl.core.conventions.cameras.convert_projection import \
+ convert_perspective_to_weakperspective # prevent yapf isort conflict
+from detrsmpl.models.body_models.builder import build_body_model
+from detrsmpl.utils.transforms import aa_to_rotmat, rotmat_to_aa
+
+
+def convert_smpl_from_opencv_calibration(
+ R: Union[np.ndarray, torch.Tensor],
+ T: Union[np.ndarray, torch.Tensor],
+ K: Optional[Union[np.ndarray, torch.Tensor]] = None,
+ resolution: Optional[Union[Iterable[int], int]] = None,
+ verts: Optional[Union[np.ndarray, torch.Tensor]] = None,
+ poses: Optional[Union[np.ndarray, torch.Tensor]] = None,
+ transl: Optional[Union[np.ndarray, torch.Tensor]] = None,
+ model_path: Optional[str] = None,
+ betas: Optional[Union[np.ndarray, torch.Tensor]] = None,
+ model_type: Optional[str] = 'smpl',
+ gender: Optional[str] = 'neutral'):
+ """Convert opencv calibration smpl poses&transl parameters to model based
+ poses&transl or verts.
+
+ Args:
+ R (Union[np.ndarray, torch.Tensor]): (frame, 3, 3)
+        T (Union[np.ndarray, torch.Tensor]): (frame, 3)
+ K (Optional[Union[np.ndarray, torch.Tensor]], optional):
+ (frame, 3, 3) or (frame, 4, 4). Defaults to None.
+ resolution (Optional[Union[Iterable[int], int]], optional):
+ (height, width). Defaults to None.
+ verts (Optional[Union[np.ndarray, torch.Tensor]], optional):
+ (frame, num_verts, 3). Defaults to None.
+ poses (Optional[Union[np.ndarray, torch.Tensor]], optional):
+ (frame, 72/165). Defaults to None.
+ transl (Optional[Union[np.ndarray, torch.Tensor]], optional):
+ (frame, 3). Defaults to None.
+ model_path (Optional[str], optional): model path.
+ Defaults to None.
+ betas (Optional[Union[np.ndarray, torch.Tensor]], optional):
+ (frame, 10). Defaults to None.
+ model_type (Optional[str], optional): choose in 'smpl' or 'smplx'.
+ Defaults to 'smpl'.
+ gender (Optional[str], optional): choose in 'male', 'female',
+ 'neutral'.
+ Defaults to 'neutral'.
+
+ Raises:
+ ValueError: wrong input poses or transl.
+
+ Returns:
+ Tuple[torch.Tensor]: Return converted poses, transl, pred_cam
+ or verts, pred_cam.
+ """
+ R_, T_ = convert_world_view(R, T)
+
+ RT = torch.eye(4, 4)[None]
+ RT[:, :3, :3] = R_
+ RT[:, :3, 3] = T_
+
+ if verts is not None:
+ poses = None
+ betas = None
+ transl = None
+ else:
+ assert poses is not None
+ assert transl is not None
+ if isinstance(poses, dict):
+ poses = copy.deepcopy(poses)
+ for k in poses:
+ if isinstance(poses[k], np.ndarray):
+ poses[k] = torch.Tensor(poses[k])
+ elif isinstance(poses, np.ndarray):
+ poses = torch.Tensor(poses)
+ elif isinstance(poses, torch.Tensor):
+ poses = poses.clone()
+ else:
+ raise ValueError(f'Wrong data type of poses: {type(poses)}.')
+
+ if isinstance(transl, np.ndarray):
+ transl = torch.Tensor(transl)
+ elif isinstance(transl, torch.Tensor):
+ transl = transl.clone()
+ else:
+ raise ValueError('Should pass valid `transl`.')
+ transl = transl.view(-1, 3)
+
+ if isinstance(betas, np.ndarray):
+ betas = torch.Tensor(betas)
+ elif isinstance(betas, torch.Tensor):
+ betas = betas.clone()
+
+ body_model = build_body_model(
+ dict(type=model_type,
+ model_path=os.path.join(model_path, model_type),
+ gender=gender,
+ model_type=model_type))
+ if isinstance(poses, dict):
+ poses.update({'transl': transl, 'betas': betas})
+ else:
+ if isinstance(poses, np.ndarray):
+ poses = torch.tensor(poses)
+ poses = body_model.tensor2dict(full_pose=poses,
+ transl=transl,
+ betas=betas)
+ model_output = body_model(**poses)
+ verts = model_output['vertices']
+
+ global_orient = poses['global_orient']
+ global_orient = rotmat_to_aa(R_ @ aa_to_rotmat(global_orient))
+ poses['global_orient'] = global_orient
+ poses['transl'] = None
+ verts_rotated = model_output['vertices']
+ rotated_pose = body_model.dict2tensor(poses)
+
+ verts_converted = verts.clone().view(-1, 3)
+ verts_converted = RT @ torch.cat(
+ [verts_converted,
+ torch.ones(verts_converted.shape[0], 1)], dim=-1).unsqueeze(-1)
+ verts_converted = verts_converted.squeeze(-1)
+ verts_converted = verts_converted[:, :3] / verts_converted[:, 3:]
+ verts_converted = verts_converted.view(verts.shape[0], -1, 3)
+ num_frame = verts_converted.shape[0]
+ if poses is not None:
+ transl = torch.mean(verts_converted - verts_rotated, dim=1)
+
+ orig_cam = None
+ if K is not None:
+ zmean = torch.mean(verts_converted, dim=1)[:, 2]
+
+ K, _, _ = convert_camera_matrix(K,
+ is_perspective=True,
+ convention_dst='opencv',
+ convention_src='opencv',
+ in_ndc_dst=True,
+ in_ndc_src=False,
+ resolution_src=resolution)
+ K = K.repeat(num_frame, 1, 1)
+
+ orig_cam = convert_perspective_to_weakperspective(
+ K=K, zmean=zmean, in_ndc=True, resolution=resolution)
+
+ if poses is not None:
+ orig_cam[:, 0, 3] += transl[:, 0]
+ orig_cam[:, 1, 3] += transl[:, 1]
+ if poses is not None:
+ return rotated_pose, orig_cam
+ else:
+ return verts_converted, orig_cam
+
+
+def project_points(points3d: Union[np.ndarray, torch.Tensor],
+ cameras: CamerasBase = None,
+ resolution: Iterable[int] = None,
+ K: Union[torch.Tensor, np.ndarray] = None,
+ R: Union[torch.Tensor, np.ndarray] = None,
+ T: Union[torch.Tensor, np.ndarray] = None,
+ convention: str = 'opencv',
+ in_ndc: bool = False) -> Union[torch.Tensor, np.ndarray]:
+ """Project 3d points to image.
+
+ Args:
+ points3d (Union[np.ndarray, torch.Tensor]): shape could be (..., 3).
+ cameras (CamerasBase): pytorch3d cameras or mmhuman3d cameras.
+ resolution (Iterable[int]): (height, width) for rectangle or width for
+ square.
+ K (Union[torch.Tensor, np.ndarray], optional): intrinsic matrix.
+ Defaults to None.
+ R (Union[torch.Tensor, np.ndarray], optional): rotation matrix.
+ Defaults to None.
+ T (Union[torch.Tensor, np.ndarray], optional): translation matrix.
+ Defaults to None.
+ convention (str, optional): camera convention. Defaults to 'opencv'.
+ in_ndc (bool, optional): whether in NDC. Defaults to False.
+
+ Returns:
+ Union[torch.Tensor, np.ndarray]: transformed points of shape (..., 2).
+ """
+ if cameras is None:
+ cameras = build_cameras(
+ dict(type='perspective',
+ convention=convention,
+ in_ndc=in_ndc,
+ resolution=resolution,
+ K=K,
+ R=R,
+ T=T))
+ if cameras.get_image_size() is not None:
+ image_size = cameras.get_image_size()
+ else:
+ image_size = resolution
+ if isinstance(points3d, np.ndarray):
+ points3d = torch.Tensor(points3d[..., :3]).to(cameras.device)
+ points2d = cameras.transform_points_screen(
+ points3d, image_size=image_size).cpu().numpy()
+ elif isinstance(points3d, torch.Tensor):
+ points3d = points3d[..., :3].to(cameras.device)
+ points2d = cameras.transform_points_screen(points3d,
+ image_size=image_size)
+ return points2d
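+
+
+# Illustrative usage sketch (not part of the original API docs). The focal
+# length, principal point and resolution below are made-up values, and a
+# 3x3 OpenCV-style intrinsic is assumed to be accepted by the camera builder.
+#   import torch
+#   K = torch.tensor([[[500., 0., 256.], [0., 500., 256.], [0., 0., 1.]]])
+#   points3d = torch.rand(10, 3) + torch.tensor([0., 0., 2.])
+#   points2d = project_points(points3d, K=K, R=torch.eye(3)[None],
+#                             T=torch.zeros(1, 3), resolution=(512, 512))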
diff --git a/detrsmpl/utils/collect_env.py b/detrsmpl/utils/collect_env.py
new file mode 100644
index 0000000000000000000000000000000000000000..b1733944e021354df70349d38abdd68f1b705228
--- /dev/null
+++ b/detrsmpl/utils/collect_env.py
@@ -0,0 +1,16 @@
+from mmcv.utils import collect_env as collect_base_env
+from mmcv.utils import get_git_hash
+
+import detrsmpl
+
+
+def collect_env():
+ """Collect the information of the running environments."""
+ env_info = collect_base_env()
+ env_info['MMHuman3d'] = detrsmpl.__version__ + '+' + get_git_hash()[:7]
+ return env_info
+
+
+if __name__ == '__main__':
+ for name, val in collect_env().items():
+ print(f'{name}: {val}')
diff --git a/detrsmpl/utils/demo_utils.py b/detrsmpl/utils/demo_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ab077444573c2b645f3c2714c2ad0a818d94da9
--- /dev/null
+++ b/detrsmpl/utils/demo_utils.py
@@ -0,0 +1,823 @@
+import colorsys
+import os
+from collections import defaultdict
+from contextlib import contextmanager
+from functools import partial
+from pathlib import Path
+
+import mmcv
+import numpy as np
+from mmcv import Timer
+from scipy import interpolate
+
+from detrsmpl.core.post_processing import build_post_processing
+
+try:
+ from typing import Literal
+except ImportError:
+ from typing_extensions import Literal
+
+
+def xyxy2xywh(bbox_xyxy):
+ """Transform the bbox format from x1y1x2y2 to xywh.
+
+ Args:
+ bbox_xyxy (np.ndarray): Bounding boxes (with scores), shaped (n, 4) or
+ (n, 5). (left, top, right, bottom, [score])
+
+ Returns:
+ np.ndarray: Bounding boxes (with scores),
+ shaped (n, 4) or (n, 5). (left, top, width, height, [score])
+ """
+ if not isinstance(bbox_xyxy, np.ndarray):
+ raise TypeError(
+ f'Input type is {type(bbox_xyxy)}, which should be numpy.ndarray.')
+ bbox_xywh = bbox_xyxy.copy()
+ bbox_xywh[..., 2] = bbox_xywh[..., 2] - bbox_xywh[..., 0]
+ bbox_xywh[..., 3] = bbox_xywh[..., 3] - bbox_xywh[..., 1]
+
+ return bbox_xywh
+
+
+def xywh2xyxy(bbox_xywh):
+ """Transform the bbox format from xywh to x1y1x2y2.
+
+ Args:
+ bbox_xywh (np.ndarray): Bounding boxes (with scores), shaped
+ (n, 4) or (n, 5). (left, top, width, height, [score])
+
+ Returns:
+ np.ndarray: Bounding boxes (with scores),
+ shaped (n, 4) or (n, 5). (left, top, right, bottom, [score])
+ """
+ if not isinstance(bbox_xywh, np.ndarray):
+ raise TypeError(
+ f'Input type is {type(bbox_xywh)}, which should be numpy.ndarray.')
+ bbox_xyxy = bbox_xywh.copy()
+ bbox_xyxy[..., 2] = bbox_xyxy[..., 2] + bbox_xyxy[..., 0] - 1
+ bbox_xyxy[..., 3] = bbox_xyxy[..., 3] + bbox_xyxy[..., 1] - 1
+
+ return bbox_xyxy
+
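+# Illustrative sketch: the two converters use slightly different corner
+# conventions, so a round trip is off by one pixel on the right/bottom edge.
+#   import numpy as np
+#   xyxy2xywh(np.array([[10., 20., 110., 220., 0.9]]))
+#   # -> [[10., 20., 100., 200., 0.9]]
+#   xywh2xyxy(np.array([[10., 20., 100., 200.]]))
+#   # -> [[10., 20., 109., 219.]]
+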
+
+def box2cs(bbox_xywh, aspect_ratio=1.0, bbox_scale_factor=1.25):
+ """Convert xywh coordinates to center and scale.
+
+ Args:
+        bbox_xywh (numpy.ndarray): bounding boxes in xywh format, shaped
+            (n, 4) or (n, 5). (left, top, width, height, [score])
+        aspect_ratio (float, optional): target width / height ratio.
+            Defaults to 1.0.
+        bbox_scale_factor (float, optional): scale factor for expanding the
+            bbox. Defaults to 1.25.
+ Returns:
+ numpy.ndarray: center of the bbox
+ numpy.ndarray: the scale of the bbox w & h
+ """
+ if not isinstance(bbox_xywh, np.ndarray):
+ raise TypeError(
+ f'Input type is {type(bbox_xywh)}, which should be numpy.ndarray.')
+
+ bbox_xywh = bbox_xywh.copy()
+ pixel_std = 1
+ center = np.stack([
+ bbox_xywh[..., 0] + bbox_xywh[..., 2] * 0.5,
+ bbox_xywh[..., 1] + bbox_xywh[..., 3] * 0.5
+ ], -1)
+
+ mask_h = bbox_xywh[..., 2] > aspect_ratio * bbox_xywh[..., 3]
+ mask_w = ~mask_h
+
+ bbox_xywh[mask_h, 3] = bbox_xywh[mask_h, 2] / aspect_ratio
+ bbox_xywh[mask_w, 2] = bbox_xywh[mask_w, 3] * aspect_ratio
+ scale = np.stack([
+ bbox_xywh[..., 2] * 1.0 / pixel_std,
+ bbox_xywh[..., 3] * 1.0 / pixel_std
+ ], -1)
+ scale = scale * bbox_scale_factor
+
+ return center, scale
+
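+# Illustrative sketch: a 100x200 box is squared up to the aspect ratio before
+# scaling, so both scale entries become 200 * 1.25.
+#   import numpy as np
+#   center, scale = box2cs(np.array([[10., 20., 100., 200.]]))
+#   # center -> [[60., 120.]], scale -> [[250., 250.]]
+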
+
+def convert_crop_cam_to_orig_img(cam: np.ndarray,
+ bbox: np.ndarray,
+ img_width: int,
+ img_height: int,
+ aspect_ratio: float = 1.0,
+ bbox_scale_factor: float = 1.25,
+ bbox_format: Literal['xyxy', 'xywh',
+ 'cs'] = 'xyxy'):
+ """This function is modified from [VIBE](https://github.com/
+ mkocabas/VIBE/blob/master/lib/utils/demo_utils.py#L242-L259). Original
+ license please see docs/additional_licenses.md.
+
+ Args:
+        cam (np.ndarray): weak perspective camera in cropped image
+            coordinates, shaped (frame, 3) or (frame, num_person, 3).
+ bbox (np.ndarray): bbox coordinates
+ img_width (int): original image width
+ img_height (int): original image height
+ aspect_ratio (float, optional): Defaults to 1.0.
+ bbox_scale_factor (float, optional): Defaults to 1.25.
+ bbox_format (Literal['xyxy', 'xywh', 'cs']): Defaults to 'xyxy'.
+            'xyxy' means the left-up point and right-bottom point of the
+ bbox.
+ 'xywh' means the left-up point and the width and height of the
+ bbox.
+ 'cs' means the center of the bbox (x,y) and the scale of the
+ bbox w & h.
+ Returns:
+ orig_cam: shape = (frame, 4) or (frame, num_person, 4)
+ """
+ if not isinstance(bbox, np.ndarray):
+ raise TypeError(
+ f'Input type is {type(bbox)}, which should be numpy.ndarray.')
+ bbox = bbox.copy()
+ if bbox_format == 'xyxy':
+ bbox_xywh = xyxy2xywh(bbox)
+ center, scale = box2cs(bbox_xywh, aspect_ratio, bbox_scale_factor)
+ bbox_cs = np.concatenate([center, scale], axis=-1)
+ elif bbox_format == 'xywh':
+ center, scale = box2cs(bbox, aspect_ratio, bbox_scale_factor)
+ bbox_cs = np.concatenate([center, scale], axis=-1)
+ elif bbox_format == 'cs':
+ bbox_cs = bbox
+ else:
+ raise ValueError('Only supports the format of `xyxy`, `cs` and `xywh`')
+
+ cx, cy, h = bbox_cs[..., 0], bbox_cs[..., 1], bbox_cs[..., 2] + 1e-6
+ hw, hh = img_width / 2., img_height / 2.
+ sx = cam[..., 0] * (1. / (img_width / h))
+ sy = cam[..., 0] * (1. / (img_height / h))
+ tx = ((cx - hw) / hw / (sx + 1e-6)) + cam[..., 1]
+ ty = ((cy - hh) / hh / (sy + 1e-6)) + cam[..., 2]
+
+ orig_cam = np.stack([sx, sy, tx, ty], axis=-1)
+ return orig_cam
+
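+# Illustrative sketch (hypothetical values): map a weak-perspective camera
+# predicted in a 224x224 crop back to a 1920x1080 frame using the crop bbox.
+#   import numpy as np
+#   pred_cam = np.array([[0.9, 0.05, -0.02]])          # (frame, 3)
+#   bbox_xyxy = np.array([[600., 300., 900., 800.]])   # (frame, 4)
+#   orig_cam = convert_crop_cam_to_orig_img(pred_cam, bbox_xyxy, 1920, 1080)
+#   # orig_cam has shape (frame, 4): [sx, sy, tx, ty] in full-image space.
+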
+
+def convert_bbox_to_intrinsic(bboxes: np.ndarray,
+ img_width: int = 224,
+ img_height: int = 224,
+ bbox_scale_factor: float = 1.25,
+ bbox_format: Literal['xyxy', 'xywh'] = 'xyxy'):
+ """Convert bbox to intrinsic parameters.
+
+ Args:
+        bboxes (np.ndarray): (frame, num_person, 4), (frame, 4), or (4,)
+ img_width (int): image width of training data.
+ img_height (int): image height of training data.
+ bbox_scale_factor (float): scale factor for expanding the bbox.
+ bbox_format (Literal['xyxy', 'xywh'] ): 'xyxy' means the left-up point
+            and right-bottom point of the bbox.
+ 'xywh' means the left-up point and the width and height of the
+ bbox.
+ Returns:
+ np.ndarray: (frame, num_person, 3, 3), (frame, 3, 3) or (3,3)
+ """
+ if not isinstance(bboxes, np.ndarray):
+ raise TypeError(
+ f'Input type is {type(bboxes)}, which should be numpy.ndarray.')
+ assert bbox_format in ['xyxy', 'xywh']
+
+ if bbox_format == 'xyxy':
+ bboxes = xyxy2xywh(bboxes)
+
+ center_x = bboxes[..., 0] + bboxes[..., 2] / 2.0
+ center_y = bboxes[..., 1] + bboxes[..., 3] / 2.0
+
+ W = np.max(bboxes[..., 2:], axis=-1) * bbox_scale_factor
+
+ num_frame = bboxes.shape[0]
+ if bboxes.ndim == 3:
+ num_person = bboxes.shape[1]
+ Ks = np.zeros((num_frame, num_person, 3, 3))
+ elif bboxes.ndim == 2:
+ Ks = np.zeros((num_frame, 3, 3))
+ elif bboxes.ndim == 1:
+ Ks = np.zeros((3, 3))
+ else:
+        raise ValueError(f'Wrong input bboxes shape {bboxes.shape}')
+
+ Ks[..., 0, 0] = W / img_width
+ Ks[..., 1, 1] = W / img_height
+ Ks[..., 0, 2] = center_x - W / 2.0
+ Ks[..., 1, 2] = center_y - W / 2.0
+ Ks[..., 2, 2] = 1
+ return Ks
+
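+# Illustrative sketch: a single 100x200 xyxy box expanded by 1.25 yields a
+# 250-pixel square crop, so the scale entries are 250 / 224.
+#   import numpy as np
+#   Ks = convert_bbox_to_intrinsic(np.array([[10., 20., 110., 220.]]))
+#   # Ks[0] ~ [[1.116, 0., -65.], [0., 1.116, -5.], [0., 0., 1.]]
+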
+
+def get_default_hmr_intrinsic(num_frame=1,
+ focal_length=1000,
+ det_width=224,
+ det_height=224) -> np.ndarray:
+ """Get default hmr intrinsic, defined by how you trained.
+
+ Args:
+ num_frame (int, optional): num of frames. Defaults to 1.
+ focal_length (int, optional): defined same as your training.
+ Defaults to 1000.
+ det_width (int, optional): the size you used to detect.
+ Defaults to 224.
+ det_height (int, optional): the size you used to detect.
+ Defaults to 224.
+
+ Returns:
+ np.ndarray: shape of (N, 3, 3)
+ """
+ K = np.zeros((num_frame, 3, 3))
+ K[:, 0, 0] = focal_length
+ K[:, 1, 1] = focal_length
+ K[:, 0, 2] = det_width / 2
+ K[:, 1, 2] = det_height / 2
+ K[:, 2, 2] = 1
+ return K
+
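+# Illustrative sketch: two identical pinhole intrinsics with focal length 1000
+# and the principal point at the centre of a 224x224 detection crop.
+#   K = get_default_hmr_intrinsic(num_frame=2)
+#   # K.shape -> (2, 3, 3); K[0, 0, 0] == 1000, K[0, 0, 2] == 112
+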
+
+def convert_kp2d_to_bbox(
+ kp2d: np.ndarray,
+ bbox_format: Literal['xyxy', 'xywh'] = 'xyxy') -> np.ndarray:
+ """Convert kp2d to bbox.
+
+ Args:
+ kp2d (np.ndarray): shape should be (num_frame, num_points, 2/3)
+ or (num_frame, num_person, num_points, 2/3).
+ bbox_format (Literal['xyxy', 'xywh'], optional): Defaults to 'xyxy'.
+
+ Returns:
+ np.ndarray: shape will be (num_frame, num_person, 4)
+ """
+ assert bbox_format in ['xyxy', 'xywh']
+ if kp2d.ndim == 2:
+ kp2d = kp2d[None, None]
+ elif kp2d.ndim == 3:
+ kp2d = kp2d[:, None]
+ num_frame, num_person, _, _ = kp2d.shape
+    # Take the min/max over the keypoint axis of the x and y channels to get
+    # the enclosing box; keepdims preserves the (frame, person, 1) layout for
+    # the concatenation below.
+    x1 = np.min(kp2d[..., 0], axis=-2, keepdims=True)
+    y1 = np.min(kp2d[..., 1], axis=-2, keepdims=True)
+    x2 = np.max(kp2d[..., 0], axis=-2, keepdims=True)
+    y2 = np.max(kp2d[..., 1], axis=-2, keepdims=True)
+ bbox = np.concatenate([x1, y1, x2, y2], axis=-1)
+ assert bbox.shape == (num_frame, num_person, 4)
+ if bbox_format == 'xywh':
+ bbox = xyxy2xywh(bbox)
+ return bbox
+
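+# Illustrative sketch (uses the min/max box computation above): three 2D
+# keypoints are enclosed by a single xyxy box.
+#   import numpy as np
+#   kp2d = np.array([[10., 20.], [110., 220.], [60., 120.]])
+#   convert_kp2d_to_bbox(kp2d)
+#   # -> [[[10., 20., 110., 220.]]], i.e. shape (1, 1, 4)
+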
+
+def convert_verts_to_cam_coord(verts,
+ pred_cams,
+ bboxes_xy,
+ focal_length=5000.,
+ bbox_scale_factor=1.25,
+ bbox_format='xyxy'):
+ """Convert vertices from the world coordinate to camera coordinate.
+
+ Args:
+ verts ([np.ndarray]): The vertices in the world coordinate.
+ The shape is (frame,num_person,6890,3), (frame,6890,3),
+ or (6890,3).
+ pred_cams ([np.ndarray]): Camera parameters estimated by HMR or SPIN.
+ The shape is (frame,num_person,3), (frame,3), or (3,).
+ bboxes_xy ([np.ndarray]): (frame, num_person, 4|5), (frame, 4|5),
+ or (4|5,)
+ focal_length ([float],optional): Defined same as your training.
+ bbox_scale_factor (float): scale factor for expanding the bbox.
+ bbox_format (Literal['xyxy', 'xywh'] ): 'xyxy' means the left-up point
+            and right-bottom point of the bbox.
+ 'xywh' means the left-up point and the width and height of the
+ bbox.
+ Returns:
+ np.ndarray: The vertices in the camera coordinate.
+ The shape is (frame,num_person,6890,3) or (frame,6890,3).
+ np.ndarray: The intrinsic parameters of the pred_cam.
+ The shape is (num_frame, 3, 3).
+ """
+ K0 = get_default_hmr_intrinsic(focal_length=focal_length,
+ det_height=224,
+ det_width=224)
+ K1 = convert_bbox_to_intrinsic(bboxes_xy,
+ bbox_scale_factor=bbox_scale_factor,
+ bbox_format=bbox_format)
+ # K1K0(RX+T)-> K0(K0_inv K1K0)
+ Ks = np.linalg.inv(K0) @ K1 @ K0
+ # convert vertices from world to camera
+ cam_trans = np.concatenate([
+ pred_cams[..., [1]], pred_cams[..., [2]], 2 * focal_length /
+ (224 * pred_cams[..., [0]] + 1e-9)
+ ], -1)
+ verts = verts + cam_trans[..., None, :]
+ if verts.ndim == 4:
+ verts = np.einsum('fnij,fnkj->fnki', Ks, verts)
+ elif verts.ndim == 3:
+ verts = np.einsum('fij,fkj->fki', Ks, verts)
+ elif verts.ndim == 2:
+ verts = np.einsum('fij,fkj->fki', Ks, verts[None])
+ return verts, K0
+
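+# Illustrative sketch (hypothetical shapes and values): move HMR/SPIN vertices
+# predicted in a crop into the camera frame of the full image.
+#   import numpy as np
+#   verts = np.zeros((1, 6890, 3))                     # (frame, 6890, 3)
+#   pred_cams = np.array([[0.9, 0.0, 0.0]])            # (frame, 3)
+#   bboxes_xyxy = np.array([[600., 300., 900., 800.]])
+#   verts_cam, K0 = convert_verts_to_cam_coord(verts, pred_cams, bboxes_xyxy)
+#   # verts_cam keeps the vertex layout; K0 is the (1, 3, 3) default intrinsic.
+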
+
+def smooth_process(x,
+ smooth_type='savgol',
+ cfg_base_dir='configs/_base_/post_processing/'):
+ """Smooth the array with the specified smoothing type.
+
+ Args:
+ x (np.ndarray): Shape should be (frame,num_person,K,C)
+ or (frame,K,C).
+ smooth_type (str, optional): Smooth type.
+ choose in ['oneeuro', 'gaus1d', 'savgol','smoothnet',
+ 'smoothnet_windowsize8','smoothnet_windowsize16',
+ 'smoothnet_windowsize32','smoothnet_windowsize64'].
+ Defaults to 'savgol'. 'smoothnet' is default with windowsize=8.
+ cfg_base_dir (str, optional): Config base dir,
+ default configs/_base_/post_processing/
+ Raises:
+ ValueError: check the input smoothing type.
+
+ Returns:
+ np.ndarray: Smoothed data. The shape should be
+ (frame,num_person,K,C) or (frame,K,C).
+ """
+ if smooth_type == 'smoothnet':
+ smooth_type = 'smoothnet_windowsize8'
+
+ assert smooth_type in [
+ 'oneeuro', 'gaus1d', 'savgol', 'smoothnet_windowsize8',
+ 'smoothnet_windowsize16', 'smoothnet_windowsize32',
+ 'smoothnet_windowsize64'
+ ]
+
+ cfg = os.path.join(cfg_base_dir, smooth_type + '.py')
+ if isinstance(cfg, str):
+ cfg = mmcv.Config.fromfile(cfg)
+ elif not isinstance(cfg, mmcv.Config):
+ raise TypeError('config must be a filename or Config object, '
+ f'but got {type(cfg)}')
+
+ x = x.copy()
+
+ assert x.ndim == 3 or x.ndim == 4
+
+ smooth_func = build_post_processing(dict(cfg['smooth_cfg']))
+
+ if x.ndim == 4:
+ for i in range(x.shape[1]):
+ x[:, i] = smooth_func(x[:, i])
+ elif x.ndim == 3:
+ x = smooth_func(x)
+
+ return x
+
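+# Illustrative sketch (assumes the savgol config exists under cfg_base_dir):
+#   import numpy as np
+#   kp3d = np.random.rand(100, 17, 3)   # (frame, K, C)
+#   kp3d_smooth = smooth_process(kp3d, smooth_type='savgol')
+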
+
+def speed_up_process(x,
+ speed_up_type='deciwatch',
+ cfg_base_dir='configs/_base_/post_processing/'):
+ """Speed up the process with the specified speed up type.
+
+ Args:
+        x (torch.Tensor): Shape should be (frame,num_person,K,C)
+ or (frame,K,C).
+ speed_up_type (str, optional): Speed up type.
+ choose in ['deciwatch',
+ 'deciwatch_interval5_q1',
+ 'deciwatch_interval5_q2',
+ 'deciwatch_interval5_q3',
+ 'deciwatch_interval5_q4',
+ 'deciwatch_interval5_q5',
+ 'deciwatch_interval10_q1',
+ 'deciwatch_interval10_q2',
+ 'deciwatch_interval10_q3',
+ 'deciwatch_interval10_q4',
+ 'deciwatch_interval10_q5',]. Defaults to 'deciwatch'.
+ cfg_base_dir (str, optional): Config base dir.
+ Defaults to 'configs/_base_/post_processing/'
+
+ Raises:
+ ValueError: check the input speed up type.
+
+ Returns:
+ np.ndarray: Completed data. The shape should be
+ (frame,num_person,K,C) or (frame,K,C).
+ """
+
+ if speed_up_type == 'deciwatch':
+ speed_up_type = 'deciwatch_interval5_q3'
+ assert speed_up_type in [
+ 'deciwatch_interval5_q1',
+ 'deciwatch_interval5_q2',
+ 'deciwatch_interval5_q3',
+ 'deciwatch_interval5_q4',
+ 'deciwatch_interval5_q5',
+ 'deciwatch_interval10_q1',
+ 'deciwatch_interval10_q2',
+ 'deciwatch_interval10_q3',
+ 'deciwatch_interval10_q4',
+ 'deciwatch_interval10_q5',
+ ]
+
+ cfg = os.path.join(cfg_base_dir, speed_up_type + '.py')
+ if isinstance(cfg, str):
+ cfg = mmcv.Config.fromfile(cfg)
+ elif not isinstance(cfg, mmcv.Config):
+ raise TypeError('config must be a filename or Config object, '
+ f'but got {type(cfg)}')
+ x = x.clone()
+
+ assert x.ndim == 4 or x.ndim == 5
+
+ cfg_dict = cfg['speed_up_cfg']
+ cfg_dict['device'] = x.device
+
+ speed_up_func = build_post_processing(cfg_dict)
+
+ if x.ndim == 5:
+ for i in range(x.shape[1]):
+ x[:, i] = speed_up_func(x[:, i])
+ elif x.ndim == 4:
+ x = speed_up_func(x)
+
+ return np.array(x.cpu())
+
+
+def get_speed_up_interval(speed_up_type,
+ cfg_base_dir='configs/_base_/post_processing/'):
+ """Get the interval of specific speed up type.
+
+ Args:
+ speed_up_type (str, optional): Speed up type.
+ choose in ['deciwatch',
+ 'deciwatch_interval5_q1',
+ 'deciwatch_interval5_q2',
+ 'deciwatch_interval5_q3',
+ 'deciwatch_interval5_q4',
+ 'deciwatch_interval5_q5',
+ 'deciwatch_interval10_q1',
+ 'deciwatch_interval10_q2',
+ 'deciwatch_interval10_q3',
+ 'deciwatch_interval10_q4',
+ 'deciwatch_interval10_q5',]. Defaults to 'deciwatch'.
+ cfg_base_dir (str, optional): Config base dir,
+ default configs/_base_/post_processing/
+
+ Raises:
+ ValueError: check the input speed up type.
+
+ Returns:
+ int: speed up interval
+ """
+
+ if speed_up_type == 'deciwatch':
+ speed_up_type = 'deciwatch_interval5_q3'
+ assert speed_up_type in [
+ 'deciwatch_interval5_q1',
+ 'deciwatch_interval5_q2',
+ 'deciwatch_interval5_q3',
+ 'deciwatch_interval5_q4',
+ 'deciwatch_interval5_q5',
+ 'deciwatch_interval10_q1',
+ 'deciwatch_interval10_q2',
+ 'deciwatch_interval10_q3',
+ 'deciwatch_interval10_q4',
+ 'deciwatch_interval10_q5',
+ ]
+ cfg = os.path.join(cfg_base_dir, speed_up_type + '.py')
+ if isinstance(cfg, str):
+ cfg = mmcv.Config.fromfile(cfg)
+ elif not isinstance(cfg, mmcv.Config):
+ raise TypeError('config must be a filename or Config object, '
+ f'but got {type(cfg)}')
+
+ return cfg['speed_up_cfg']['interval']
+
+
+def speed_up_interpolate(selected_frames, speed_up_frames, smpl_poses,
+ smpl_betas, pred_cams, bboxes_xyxy):
+ """Interpolate smpl_betas, pred_cams, and bboxes_xyxyx for speed up.
+
+ Args:
+ selected_frames (np.ndarray): Shape should be (selected frame number).
+ speed_up_frames (int): Total speed up frame number
+ smpl_poses (np.ndarray): selected frame smpl poses parameter
+        smpl_betas (np.ndarray): selected frame smpl shape parameter
+ pred_cams (np.ndarray): selected frame camera parameter
+ bboxes_xyxy (np.ndarray): selected frame bbox
+
+ Returns:
+ smpl_poses (np.ndarray): interpolated frame smpl poses parameter
+        smpl_betas (np.ndarray): interpolated frame smpl shape parameter
+ pred_cams (np.ndarray): interpolated frame camera parameter
+ bboxes_xyxy (np.ndarray): interpolated frame bbox
+ """
+ selected_frames = selected_frames[selected_frames <= speed_up_frames]
+ pred_cams[:speed_up_frames, :] = interpolate.interp1d(
+ selected_frames, pred_cams[selected_frames, :], kind='linear',
+ axis=0)(np.arange(0, max(selected_frames)))
+ bboxes_xyxy[:speed_up_frames, :] = interpolate.interp1d(
+ selected_frames,
+ bboxes_xyxy[selected_frames, :],
+ kind='linear',
+ axis=0)(np.arange(0, max(selected_frames)))
+ smpl_betas[:speed_up_frames, :] = interpolate.interp1d(
+ selected_frames, smpl_betas[selected_frames, :], kind='linear',
+ axis=0)(np.arange(0, max(selected_frames)))
+
+ return smpl_poses, smpl_betas, pred_cams, bboxes_xyxy
+
+
+def process_mmtracking_results(mmtracking_results,
+ max_track_id,
+ bbox_thr=None):
+ """Process mmtracking results.
+
+ Args:
+ mmtracking_results ([list]): mmtracking_results.
+ bbox_thr (float): threshold for bounding boxes.
+ max_track_id (int): the maximum track id.
+ Returns:
+ person_results ([list]): a list of tracked bounding boxes
+ max_track_id (int): the maximum track id.
+ instance_num (int): the number of instance.
+ """
+ person_results = []
+ # 'track_results' is changed to 'track_bboxes'
+ # in https://github.com/open-mmlab/mmtracking/pull/300
+ if 'track_bboxes' in mmtracking_results:
+ tracking_results = mmtracking_results['track_bboxes'][0]
+ elif 'track_results' in mmtracking_results:
+ tracking_results = mmtracking_results['track_results'][0]
+
+ tracking_results = np.array(tracking_results)
+
+ if bbox_thr is not None:
+ assert tracking_results.shape[-1] == 6
+ valid_idx = np.where(tracking_results[:, 5] > bbox_thr)[0]
+ tracking_results = tracking_results[valid_idx]
+
+ for track in tracking_results:
+ person = {}
+ person['track_id'] = int(track[0])
+ if max_track_id < int(track[0]):
+ max_track_id = int(track[0])
+ person['bbox'] = track[1:]
+ person_results.append(person)
+ person_results = sorted(person_results, key=lambda x: x.get('track_id', 0))
+ instance_num = len(person_results)
+ return person_results, max_track_id, instance_num
+
+
+def process_mmdet_results(mmdet_results, cat_id=1, bbox_thr=None):
+ """Process mmdet results, and return a list of bboxes.
+
+ Args:
+ mmdet_results (list|tuple): mmdet results.
+ bbox_thr (float): threshold for bounding boxes.
+ cat_id (int): category id (default: 1 for human)
+
+ Returns:
+ person_results (list): a list of detected bounding boxes
+ """
+ if isinstance(mmdet_results, tuple):
+ det_results = mmdet_results[0]
+ else:
+ det_results = mmdet_results
+
+ bboxes = det_results[cat_id - 1]
+
+ person_results = []
+ bboxes = np.array(bboxes)
+
+ if bbox_thr is not None:
+ assert bboxes.shape[-1] == 5
+ valid_idx = np.where(bboxes[:, 4] > bbox_thr)[0]
+ bboxes = bboxes[valid_idx]
+
+ for bbox in bboxes:
+ person = {}
+ person['bbox'] = bbox
+ person_results.append(person)
+
+ return person_results
+
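+# Illustrative sketch: a single-class detection result with one confident box.
+#   import numpy as np
+#   mmdet_results = [np.array([[0., 0., 50., 80., 0.99]])]
+#   process_mmdet_results(mmdet_results, cat_id=1, bbox_thr=0.5)
+#   # -> a one-element list: [{'bbox': <the 5-value row above>}]
+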
+
+def prepare_frames(input_path=None):
+ """Prepare frames from input_path.
+
+ Args:
+ input_path (str, optional): Defaults to None.
+
+ Raises:
+ ValueError: check the input path.
+
+ Returns:
+ List[np.ndarray]: prepared frames
+ """
+ if Path(input_path).is_file():
+ img_list = [mmcv.imread(input_path)]
+ if img_list[0] is None:
+ video = mmcv.VideoReader(input_path)
+ assert video.opened, f'Failed to load file {input_path}'
+ img_list = list(video)
+ elif Path(input_path).is_dir():
+ # input_type = 'folder'
+ file_list = [
+ os.path.join(input_path, fn) for fn in os.listdir(input_path)
+ if fn.lower().endswith(('.png', '.jpg'))
+ ]
+ file_list.sort()
+ img_list = [mmcv.imread(img_path) for img_path in file_list]
+ assert len(img_list), f'Failed to load image from {input_path}'
+ else:
+        raise ValueError('Input path should be a file or folder.'
+ f' Got invalid input path: {input_path}')
+ return img_list
+
+
+def extract_feature_sequence(extracted_results,
+ frame_idx,
+ causal,
+ seq_len,
+ step=1):
+ """Extract the target frame from person results, and pad the sequence to a
+ fixed length.
+
+ Args:
+ extracted_results (List[List[Dict]]): Multi-frame feature extraction
+ results stored in a nested list. Each element of the outer list
+ is the feature extraction results of a single frame, and each
+ element of the inner list is the feature information of one person,
+ which contains:
+ features (ndarray): extracted features
+ track_id (int): unique id of each person, required when
+                ``with_track_id==True``
+            bbox ((4, ) or (5, )): left, top, right, bottom, [score]
+ frame_idx (int): The index of the frame in the original video.
+ causal (bool): If True, the target frame is the first frame in
+ a sequence. Otherwise, the target frame is in the middle of a
+ sequence.
+ seq_len (int): The number of frames in the input sequence.
+ step (int): Step size to extract frames from the video.
+
+ Returns:
+ List[List[Dict]]: Multi-frame feature extraction results stored in a
+ nested list with a length of seq_len.
+ int: The target frame index in the padded sequence.
+ """
+
+ if causal:
+ frames_left = 0
+ frames_right = seq_len - 1
+ else:
+ frames_left = (seq_len - 1) // 2
+ frames_right = frames_left
+ num_frames = len(extracted_results)
+
+ # get the padded sequence
+ pad_left = max(0, frames_left - frame_idx // step)
+ pad_right = max(0, frames_right - (num_frames - 1 - frame_idx) // step)
+ start = max(frame_idx % step, frame_idx - frames_left * step)
+ end = min(num_frames - (num_frames - 1 - frame_idx) % step,
+ frame_idx + frames_right * step + 1)
+ extracted_results_seq = [extracted_results[0]] * pad_left + \
+ extracted_results[start:end:step] + [extracted_results[-1]] * pad_right
+ return extracted_results_seq
+
+
+def get_different_colors(number_of_colors,
+ flag=0,
+ alpha: float = 1.0,
+ mode: str = 'bgr',
+ int_dtype: bool = True):
+ """Get a numpy of colors of shape (N, 3)."""
+ mode = mode.lower()
+ assert set(mode).issubset({'r', 'g', 'b', 'a'})
+ nst0 = np.random.get_state()
+ np.random.seed(flag)
+ colors = []
+ for i in np.arange(0., 360., 360. / number_of_colors):
+ hue = i / 360.
+ lightness = (50 + np.random.rand() * 10) / 100.
+ saturation = (90 + np.random.rand() * 10) / 100.
+ colors.append(colorsys.hls_to_rgb(hue, lightness, saturation))
+ colors_np = np.asarray(colors)
+ if int_dtype:
+ colors_bgr = (255 * colors_np).astype(np.uint8)
+ else:
+ colors_bgr = colors_np.astype(np.float32)
+ # recover the random state
+ np.random.set_state(nst0)
+ color_dict = {}
+ if 'a' in mode:
+        # one alpha value per color so each channel column has width 1
+        color_dict['a'] = np.ones((colors_bgr.shape[0], 1)) * alpha
+ color_dict['b'] = colors_bgr[:, 0:1]
+ color_dict['g'] = colors_bgr[:, 1:2]
+ color_dict['r'] = colors_bgr[:, 2:3]
+ colors_final = []
+ for channel in mode:
+ colors_final.append(color_dict[channel])
+ colors_final = np.concatenate(colors_final, -1)
+ return colors_final
+
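+# Illustrative sketch: five distinct uint8 BGR colors, reproducible because
+# the random state is seeded by `flag` and restored afterwards.
+#   colors = get_different_colors(5)
+#   # colors.shape -> (5, 3), dtype uint8
+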
+
+class RunningAverage():
+ r"""A helper class to calculate running average in a sliding window.
+
+ Args:
+ window (int): The size of the sliding window.
+ """
+ def __init__(self, window: int = 1):
+ self.window = window
+ self._data = []
+
+ def update(self, value):
+ """Update a new data sample."""
+ self._data.append(value)
+ self._data = self._data[-self.window:]
+
+ def average(self):
+ """Get the average value of current window."""
+ return np.mean(self._data)
+
+
+class StopWatch:
+ r"""A helper class to measure FPS and detailed time consuming of each phase
+ in a video processing loop or similar scenarios.
+
+ Args:
+ window (int): The sliding window size to calculate the running average
+            of the time cost.
+
+ Example:
+ >>> from mmpose.utils import StopWatch
+ >>> import time
+ >>> stop_watch = StopWatch(window=10)
+ >>> with stop_watch.timeit('total'):
+ >>> time.sleep(0.1)
+ >>> # 'timeit' support nested use
+ >>> with stop_watch.timeit('phase1'):
+ >>> time.sleep(0.1)
+ >>> with stop_watch.timeit('phase2'):
+ >>> time.sleep(0.2)
+ >>> time.sleep(0.2)
+ >>> report = stop_watch.report()
+ """
+ def __init__(self, window=1):
+ self.window = window
+ self._record = defaultdict(partial(RunningAverage, window=self.window))
+ self._timer_stack = []
+
+ @contextmanager
+ def timeit(self, timer_name='_FPS_'):
+ """Timing a code snippet with an assigned name.
+
+ Args:
+ timer_name (str): The unique name of the interested code snippet to
+ handle multiple timers and generate reports. Note that '_FPS_'
+ is a special key that the measurement will be in `fps` instead
+ of `millisecond`. Also see `report` and `report_strings`.
+ Default: '_FPS_'.
+ Note:
+ This function should always be used in a `with` statement, as shown
+ in the example.
+ """
+ self._timer_stack.append((timer_name, Timer()))
+ try:
+ yield
+ finally:
+ timer_name, timer = self._timer_stack.pop()
+ self._record[timer_name].update(timer.since_start())
+
+ def report(self, key=None):
+ """Report timing information.
+
+ Returns:
+ dict: The key is the timer name and the value is the \
+                corresponding average time cost.
+ """
+ result = {
+ name: r.average() * 1000.
+ for name, r in self._record.items()
+ }
+
+ if '_FPS_' in result:
+ result['_FPS_'] = 1000. / result.pop('_FPS_')
+
+ if key is None:
+ return result
+ return result[key]
+
+ def report_strings(self):
+ """Report timing information in texture strings.
+
+ Returns:
+ list(str): Each element is the information string of a timed \
+ event, in format of '{timer_name}: {time_in_ms}'. \
+ Specially, if timer_name is '_FPS_', the result will \
+ be converted to fps.
+ """
+ result = self.report()
+ strings = []
+        if '_FPS_' in result:
+            # pop '_FPS_' so the fps entry is not duplicated below
+            strings.append(f'FPS: {result.pop("_FPS_"):>5.1f}')
+ strings += [f'{name}: {val:>3.0f}' for name, val in result.items()]
+ return strings
+
+    def reset(self):
+        """Clear all recorded timings and any active timers."""
+        self._record = defaultdict(partial(RunningAverage, window=self.window))
+        self._timer_stack = []
diff --git a/detrsmpl/utils/dist_utils.py b/detrsmpl/utils/dist_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..83800706f8e4bb81395eb8e33ecc69028bd98f3e
--- /dev/null
+++ b/detrsmpl/utils/dist_utils.py
@@ -0,0 +1,67 @@
+from collections import OrderedDict
+
+import torch.distributed as dist
+from mmcv.runner import OptimizerHook
+from torch._utils import (
+ _flatten_dense_tensors,
+ _take_tensors,
+ _unflatten_dense_tensors,
+)
+
+
+def _allreduce_coalesced(tensors, world_size, bucket_size_mb=-1):
+ if bucket_size_mb > 0:
+ bucket_size_bytes = bucket_size_mb * 1024 * 1024
+ buckets = _take_tensors(tensors, bucket_size_bytes)
+ else:
+ buckets = OrderedDict()
+ for tensor in tensors:
+ tp = tensor.type()
+ if tp not in buckets:
+ buckets[tp] = []
+ buckets[tp].append(tensor)
+ buckets = buckets.values()
+
+ for bucket in buckets:
+ flat_tensors = _flatten_dense_tensors(bucket)
+ dist.all_reduce(flat_tensors)
+ flat_tensors.div_(world_size)
+ for tensor, synced in zip(
+ bucket, _unflatten_dense_tensors(flat_tensors, bucket)):
+ tensor.copy_(synced)
+
+
+def allreduce_grads(params, coalesce=True, bucket_size_mb=-1):
+ grads = [
+ param.grad.data for param in params
+ if param.requires_grad and param.grad is not None
+ ]
+ world_size = dist.get_world_size()
+ if coalesce:
+ _allreduce_coalesced(grads, world_size, bucket_size_mb)
+ else:
+ for tensor in grads:
+ dist.all_reduce(tensor.div_(world_size))
+
+
+class DistOptimizerHook(OptimizerHook):
+ def __init__(self, grad_clip=None, coalesce=True, bucket_size_mb=-1):
+ self.grad_clip = grad_clip
+ self.coalesce = coalesce
+ self.bucket_size_mb = bucket_size_mb
+
+ def after_train_iter(self, runner):
+ runner.optimizer.zero_grad()
+ runner.outputs['loss'].backward()
+ if self.grad_clip is not None:
+ self.clip_grads(runner.model.parameters())
+ runner.optimizer.step()
+
+
+def reduce_mean(tensor):
+ """"Obtain the mean of tensor on different GPUs."""
+ if not (dist.is_available() and dist.is_initialized()):
+ return tensor
+ tensor = tensor.clone()
+ dist.all_reduce(tensor.div_(dist.get_world_size()), op=dist.ReduceOp.SUM)
+ return tensor
diff --git a/detrsmpl/utils/ffmpeg_utils.py b/detrsmpl/utils/ffmpeg_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a401f6d3ae16055c870f1ec3f8a6fa99ab612824
--- /dev/null
+++ b/detrsmpl/utils/ffmpeg_utils.py
@@ -0,0 +1,1376 @@
+import glob
+import json
+import os
+import shutil
+import string
+import subprocess
+import sys
+from pathlib import Path
+from typing import Iterable, List, Optional, Tuple, Union
+
+import numpy as np
+
+from detrsmpl.utils.path_utils import check_input_path, prepare_output_path
+
+try:
+ from typing import Literal
+except ImportError:
+ from typing_extensions import Literal
+
+
+class video_writer:
+
+ def __init__(self,
+ output_path: str,
+ resolution: Iterable[int],
+ fps: float = 30.0,
+ num_frame: int = 1e9,
+ disable_log: bool = False) -> None:
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+ height, width = resolution
+ width += width % 2
+ height += height % 2
+ command = [
+ 'ffmpeg',
+ '-y', # (optional) overwrite output file if it exists
+ '-f',
+ 'rawvideo',
+ '-pix_fmt',
+ 'bgr24',
+ '-s',
+ f'{int(width)}x{int(height)}',
+ '-r',
+ f'{fps}', # frames per second
+ '-loglevel',
+ 'error',
+ '-threads',
+ '1',
+ '-i',
+ '-', # The input comes from a pipe
+ '-vcodec',
+ 'libx264',
+ '-r',
+ f'{fps}', # frames per second
+ '-an', # Tells FFMPEG not to expect any audio
+ output_path,
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ process = subprocess.Popen(
+ command,
+ stdin=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ if process.stdin is None or process.stderr is None:
+ raise BrokenPipeError('No buffer received.')
+ self.process = process
+ self.num_frame = num_frame
+ self.len = 0
+
+ def write(self, image_array: np.ndarray):
+ if self.len <= self.num_frame:
+ try:
+ self.process.stdin.write(image_array.tobytes())
+ self.len += 1
+ except KeyboardInterrupt:
+ self.__del__()
+
+ def __del__(self):
+ self.process.stdin.close()
+ self.process.stderr.close()
+ self.process.wait()
+
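+# Illustrative sketch (hypothetical output path, needs ffmpeg on PATH): stream
+# frames to ffmpeg one by one instead of materialising the whole array as
+# array_to_video below does.
+#   import numpy as np
+#   writer = video_writer('demo_out/stream.mp4', resolution=(256, 256), fps=30)
+#   for _ in range(30):
+#       writer.write(np.zeros((256, 256, 3), dtype=np.uint8))
+#   del writer  # closes the pipe and finalises the file
+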
+
+def array_to_video(
+ image_array: np.ndarray,
+ output_path: str,
+ fps: Union[int, float] = 30,
+ resolution: Optional[Union[Tuple[int, int], Tuple[float, float]]] = None,
+ disable_log: bool = False,
+) -> None:
+ """Convert an array to a video directly, gif not supported.
+
+ Args:
+ image_array (np.ndarray): shape should be (f * h * w * 3).
+ output_path (str): output video file path.
+        fps (Union[int, float], optional): fps. Defaults to 30.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): (height, width) of the output video.
+ Defaults to None.
+        disable_log (bool, optional): whether to hide the ffmpeg command log.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check output path.
+ TypeError: check input array.
+
+ Returns:
+ None.
+ """
+ if not isinstance(image_array, np.ndarray):
+ raise TypeError('Input should be np.ndarray.')
+ assert image_array.ndim == 4
+ assert image_array.shape[-1] == 3
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+ if resolution:
+ height, width = resolution
+ width += width % 2
+ height += height % 2
+ else:
+ image_array = pad_for_libx264(image_array)
+ height, width = image_array.shape[1], image_array.shape[2]
+ command = [
+ 'ffmpeg',
+ '-y', # (optional) overwrite output file if it exists
+ '-f',
+ 'rawvideo',
+ '-s',
+ f'{int(width)}x{int(height)}', # size of one frame
+ '-pix_fmt',
+ 'bgr24',
+ '-r',
+ f'{fps}', # frames per second
+ '-loglevel',
+ 'error',
+ '-threads',
+ '4',
+        '-i',
+ '-', # The input comes from a pipe
+ '-vcodec',
+ 'libx264',
+ '-an', # Tells FFMPEG not to expect any audio
+ output_path,
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ process = subprocess.Popen(
+ command,
+ stdin=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ if process.stdin is None or process.stderr is None:
+ raise BrokenPipeError('No buffer received.')
+ index = 0
+ while True:
+ if index >= image_array.shape[0]:
+ break
+ process.stdin.write(image_array[index].tobytes())
+ index += 1
+ process.stdin.close()
+ process.stderr.close()
+ process.wait()
+
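+# Illustrative sketch (hypothetical output path, needs ffmpeg on PATH): encode
+# one second of black 256x256 frames to an mp4.
+#   import numpy as np
+#   frames = np.zeros((30, 256, 256, 3), dtype=np.uint8)
+#   array_to_video(frames, 'demo_out/black.mp4', fps=30)
+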
+
+def array_to_images(
+ image_array: np.ndarray,
+ output_folder: str,
+ img_format: str = '%06d.png',
+ resolution: Optional[Union[Tuple[int, int], Tuple[float, float]]] = None,
+ disable_log: bool = False,
+) -> None:
+ """Convert an array to images directly.
+
+ Args:
+ image_array (np.ndarray): shape should be (f * h * w * 3).
+ output_folder (str): output folder for the images.
+ img_format (str, optional): format of the images.
+ Defaults to '%06d.png'.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): resolution(height, width) of output.
+ Defaults to None.
+        disable_log (bool, optional): whether to hide the ffmpeg command log.
+ Defaults to False.
+
+ Raises:
+ FileNotFoundError: check output folder.
+ TypeError: check input array.
+
+ Returns:
+ None
+ """
+ prepare_output_path(
+ output_folder,
+ allowed_suffix=[],
+ tag='output image folder',
+ path_type='dir',
+ overwrite=True)
+
+ if not isinstance(image_array, np.ndarray):
+ raise TypeError('Input should be np.ndarray.')
+ assert image_array.ndim == 4
+ assert image_array.shape[-1] == 3
+ if resolution:
+ height, width = resolution
+ else:
+ height, width = image_array.shape[1], image_array.shape[2]
+ command = [
+ 'ffmpeg',
+ '-y', # (optional) overwrite output file if it exists
+ '-f',
+ 'rawvideo',
+ '-s',
+ f'{int(width)}x{int(height)}', # size of one frame
+ '-pix_fmt',
+ 'bgr24', # bgr24 for matching OpenCV
+ '-loglevel',
+ 'error',
+ '-threads',
+ '4',
+ '-i',
+ '-', # The input comes from a pipe
+ '-f',
+ 'image2',
+ '-start_number',
+ '0',
+ os.path.join(output_folder, img_format),
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ process = subprocess.Popen(
+ command,
+ stdin=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ bufsize=10**8,
+ close_fds=True)
+ if process.stdin is None or process.stderr is None:
+ raise BrokenPipeError('No buffer received.')
+ index = 0
+ while True:
+ if index >= image_array.shape[0]:
+ break
+ process.stdin.write(image_array[index].tobytes())
+ index += 1
+ process.stdin.close()
+ process.stderr.close()
+ process.wait()
+
+
+def video_to_array(
+ input_path: str,
+ resolution: Optional[Union[Tuple[int, int], Tuple[float, float]]] = None,
+ start: int = 0,
+ end: Optional[int] = None,
+ disable_log: bool = False,
+) -> np.ndarray:
+ """
+ Read a video/gif as an array of (f * h * w * 3).
+
+ Args:
+ input_path (str): input path.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): resolution(height, width) of output.
+ Defaults to None.
+ start (int, optional): start frame index. Inclusive.
+ If < 0, will be converted to frame_index range in [0, frame_num].
+ Defaults to 0.
+ end (int, optional): end frame index. Exclusive.
+ Could be positive int or negative int or None.
+ If None, all frames from start till the last frame are included.
+ Defaults to None.
+        disable_log (bool, optional): whether to hide the ffmpeg command log.
+ Defaults to False.
+
+ Raises:
+ FileNotFoundError: check the input path.
+
+ Returns:
+ np.ndarray: shape will be (f * h * w * 3).
+ """
+ check_input_path(
+ input_path,
+ allowed_suffix=['.mp4', 'mkv', 'avi', '.gif'],
+ tag='input video',
+ path_type='file')
+
+ info = vid_info_reader(input_path)
+ if resolution:
+ height, width = resolution
+ else:
+ width, height = int(info['width']), int(info['height'])
+ num_frames = int(info['nb_frames'])
+ start = (min(start, num_frames - 1) + num_frames) % num_frames
+ end = (min(end, num_frames - 1) +
+ num_frames) % num_frames if end is not None else num_frames
+ command = [
+ 'ffmpeg',
+ '-i',
+ input_path,
+ '-filter_complex',
+ f'[0]trim=start_frame={start}:end_frame={end}[v0]',
+ '-map',
+ '[v0]',
+ '-pix_fmt',
+ 'bgr24', # bgr24 for matching OpenCV
+ '-s',
+ f'{int(width)}x{int(height)}',
+ '-f',
+ 'image2pipe',
+ '-vcodec',
+ 'rawvideo',
+ '-loglevel',
+ 'error',
+ 'pipe:'
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ # Execute FFmpeg as sub-process with stdout as a pipe
+ process = subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=10**8)
+ if process.stdout is None:
+ raise BrokenPipeError('No buffer received.')
+ # Read decoded video frames from the PIPE until no more frames to read
+ array = []
+ while True:
+ # Read decoded video frame (in raw video format) from stdout process.
+ buffer = process.stdout.read(int(width * height * 3))
+ # Break the loop if buffer length is not W*H*3\
+ # (when FFmpeg streaming ends).
+ if len(buffer) != width * height * 3:
+ break
+ img = np.frombuffer(buffer, np.uint8).reshape(height, width, 3)
+ array.append(img[np.newaxis])
+ process.stdout.flush()
+ process.stdout.close()
+ process.wait()
+ return np.concatenate(array)
+
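+# Illustrative sketch (hypothetical input path): read the first 30 frames of a
+# video back as a (30, H, W, 3) uint8 array in BGR order.
+#   frames = video_to_array('demo_out/black.mp4', start=0, end=30)
+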
+
+def images_to_sorted_images(input_folder, output_folder, img_format='%06d'):
+ """Copy and rename a folder of images into a new folder following the
+ `img_format`.
+
+ Args:
+ input_folder (str): input folder.
+ output_folder (str): output folder.
+ img_format (str, optional): image format name, do not need extension.
+ Defaults to '%06d'.
+
+ Returns:
+ str: image format of the rename images.
+ """
+ img_format = img_format.rsplit('.', 1)[0]
+    file_list = []
+    # default extension in case the folder contains no images
+    ext = 'png'
+ os.makedirs(output_folder, exist_ok=True)
+ pngs = glob.glob(os.path.join(input_folder, '*.png'))
+ if pngs:
+ ext = 'png'
+ file_list.extend(pngs)
+ jpgs = glob.glob(os.path.join(input_folder, '*.jpg'))
+ if jpgs:
+ ext = 'jpg'
+ file_list.extend(jpgs)
+ file_list.sort()
+ for index, file_name in enumerate(file_list):
+ shutil.copy(
+ file_name,
+ os.path.join(output_folder, (img_format + '.%s') % (index, ext)))
+ return img_format + '.%s' % ext
+
+
+def images_to_array(
+ input_folder: str,
+ resolution: Optional[Union[Tuple[int, int], Tuple[float, float]]] = None,
+ img_format: str = '%06d.png',
+ start: int = 0,
+ end: Optional[int] = None,
+ remove_raw_files: bool = False,
+ disable_log: bool = False,
+) -> np.ndarray:
+ """
+ Read a folder of images as an array of (f * h * w * 3).
+
+ Args:
+ input_folder (str): folder of input images.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]]:
+ resolution(height, width) of output. Defaults to None.
+ img_format (str, optional): format of images to be read.
+ Defaults to '%06d.png'.
+ start (int, optional): start frame index. Inclusive.
+ If < 0, will be converted to frame_index range in [0, frame_num].
+ Defaults to 0.
+ end (int, optional): end frame index. Exclusive.
+ Could be positive int or negative int or None.
+ If None, all frames from start till the last frame are included.
+ Defaults to None.
+ remove_raw_files (bool, optional): whether remove raw images.
+ Defaults to False.
+        disable_log (bool, optional): whether to hide the ffmpeg command log.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+
+ Returns:
+ np.ndarray: shape will be (f * h * w * 3).
+ """
+ check_input_path(
+ input_folder,
+ allowed_suffix=[''],
+ tag='input image folder',
+ path_type='dir')
+
+ input_folderinfo = Path(input_folder)
+
+ temp_input_folder = None
+ if img_format is None:
+ temp_input_folder = os.path.join(input_folderinfo.parent,
+ input_folderinfo.name + '_temp')
+ img_format = images_to_sorted_images(
+ input_folder=input_folder, output_folder=temp_input_folder)
+ input_folder = temp_input_folder
+
+ info = vid_info_reader(f'{input_folder}/{img_format}' % start)
+ if resolution:
+ height, width = resolution
+ else:
+ width, height = int(info['width']), int(info['height'])
+
+ num_frames = len(os.listdir(input_folder))
+ start = max(start, 0) % num_frames
+ end = min(end, num_frames) % (num_frames + 1) \
+ if end is not None else num_frames
+ command = [
+ 'ffmpeg',
+ '-y',
+ '-threads',
+ '1',
+ '-start_number',
+ f'{start}',
+ '-i',
+ f'{input_folder}/{img_format}',
+ '-frames:v',
+ f'{end - start}',
+ '-f',
+ 'rawvideo',
+ '-pix_fmt',
+ 'bgr24', # bgr24 for matching OpenCV
+ '-s',
+ f'{int(width)}x{int(height)}',
+ '-loglevel',
+ 'error',
+ '-'
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ process = subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=10**8)
+ if process.stdout is None:
+ raise BrokenPipeError('No buffer received.')
+ # Read decoded video frames from the PIPE until no more frames to read
+ array = []
+ while True:
+ # Read decoded video frame (in raw video format) from stdout process.
+ buffer = process.stdout.read(int(width * height * 3))
+ # Break the loop if buffer length is not W*H*3\
+ # (when FFmpeg streaming ends).
+
+ if len(buffer) != width * height * 3:
+ break
+ img = np.frombuffer(buffer, np.uint8).reshape(height, width, 3)
+ array.append(img[np.newaxis])
+ process.stdout.flush()
+ process.stdout.close()
+ process.wait()
+ if temp_input_folder is not None:
+ if Path(temp_input_folder).is_dir():
+ shutil.rmtree(temp_input_folder)
+ if remove_raw_files:
+ if Path(input_folder).is_dir():
+ shutil.rmtree(input_folder)
+
+ return np.concatenate(array)
+
+
+class vid_info_reader(object):
+
+ def __init__(self, input_path) -> None:
+ """Get video information from video, mimiced from ffmpeg-python.
+ https://github.com/kkroening/ffmpeg-python.
+
+ Args:
+            input_path (str): video file path.
+
+ Raises:
+ FileNotFoundError: check the input path.
+
+ Returns:
+ None.
+ """
+ check_input_path(
+ input_path,
+ allowed_suffix=['.mp4', '.gif', '.png', '.jpg', '.jpeg'],
+ tag='input file',
+ path_type='file')
+ cmd = [
+ 'ffprobe', '-show_format', '-show_streams', '-of', 'json',
+ input_path
+ ]
+ process = subprocess.Popen(
+ cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ out, _ = process.communicate()
+ probe = json.loads(out.decode('utf-8'))
+ video_stream = next((stream for stream in probe['streams']
+ if stream['codec_type'] == 'video'), None)
+ if video_stream is None:
+ print('No video stream found', file=sys.stderr)
+ sys.exit(1)
+ self.video_stream = video_stream
+
+ def __getitem__(
+ self,
+ key: Literal['index', 'codec_name', 'codec_long_name', 'profile',
+ 'codec_type', 'codec_time_base', 'codec_tag_string',
+ 'codec_tag', 'width', 'height', 'coded_width',
+ 'coded_height', 'has_b_frames', 'pix_fmt', 'level',
+ 'chroma_location', 'refs', 'is_avc', 'nal_length_size',
+ 'r_frame_rate', 'avg_frame_rate', 'time_base',
+ 'start_pts', 'start_time', 'duration_ts', 'duration',
+ 'bit_rate', 'bits_per_raw_sample', 'nb_frames',
+ 'disposition', 'tags']):
+ """Key (str): select in ['index', 'codec_name', 'codec_long_name',
+ 'profile', 'codec_type', 'codec_time_base', 'codec_tag_string',
+ 'codec_tag', 'width', 'height', 'coded_width', 'coded_height',
+ 'has_b_frames', 'pix_fmt', 'level', 'chroma_location', 'refs',
+ 'is_avc', 'nal_length_size', 'r_frame_rate', 'avg_frame_rate',
+ 'time_base', 'start_pts', 'start_time', 'duration_ts', 'duration',
+ 'bit_rate', 'bits_per_raw_sample', 'nb_frames', 'disposition',
+ 'tags']"""
+ return self.video_stream[key]
+
+
+def video_to_gif(
+ input_path: str,
+ output_path: str,
+ resolution: Optional[Union[Tuple[int, int], Tuple[float, float]]] = None,
+ fps: Union[float, int] = 15,
+ disable_log: bool = False,
+) -> None:
+ """Convert a video to a gif file.
+
+ Args:
+ input_path (str): video file path.
+ output_path (str): gif file path.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): (height, width) of the output video.
+ Defaults to None.
+ fps (Union[float, int], optional): frames per second. Defaults to 15.
+        disable_log (bool, optional): whether to hide the ffmpeg command log.
+ Defaults to False.
+
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None.
+ """
+ check_input_path(
+ input_path,
+ allowed_suffix=['.mp4'],
+ tag='input video',
+ path_type='file')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.gif'],
+ tag='output gif',
+ path_type='file',
+ overwrite=True)
+
+ info = vid_info_reader(input_path)
+ duration = info['duration']
+ if resolution:
+ height, width = resolution
+ else:
+ width, height = int(info['width']), int(info['height'])
+
+ command = [
+ 'ffmpeg', '-r',
+ str(info['r_frame_rate']), '-i', input_path, '-r', f'{fps}', '-s',
+ f'{width}x{height}', '-loglevel', 'error', '-t', f'{duration}',
+ '-threads', '4', '-y', output_path
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+
+
+def video_to_images(input_path: str,
+ output_folder: str,
+ resolution: Optional[Union[Tuple[int, int],
+ Tuple[float, float]]] = None,
+ img_format: str = '%06d.png',
+ start: int = 0,
+ end: Optional[int] = None,
+ disable_log: bool = False) -> None:
+ """Convert a video to a folder of images.
+
+ Args:
+ input_path (str): video file path
+ output_folder (str): output folder to store the images
+ resolution (Optional[Tuple[int, int]], optional):
+ (height, width) of output. defaults to None.
+ img_format (str, optional): format of images to be read.
+ Defaults to '%06d.png'.
+ start (int, optional): start frame index. Inclusive.
+ If < 0, will be converted to frame_index range in [0, frame_num].
+ Defaults to 0.
+ end (int, optional): end frame index. Exclusive.
+ Could be positive int or negative int or None.
+ If None, all frames from start till the last frame are included.
+ Defaults to None.
+        disable_log (bool, optional): whether to hide the ffmpeg command log.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path
+ FileNotFoundError: check the output path
+
+ Returns:
+ None
+ """
+ check_input_path(
+ input_path,
+ allowed_suffix=['.mp4'],
+ tag='input video',
+ path_type='file')
+ prepare_output_path(
+ output_folder,
+ allowed_suffix=[],
+ tag='output image folder',
+ path_type='dir',
+ overwrite=True)
+ info = vid_info_reader(input_path)
+ num_frames = int(info['nb_frames'])
+ start = (min(start, num_frames - 1) + num_frames) % num_frames
+ end = (min(end, num_frames - 1) +
+ num_frames) % num_frames if end is not None else num_frames
+
+ command = [
+ 'ffmpeg', '-i', input_path, '-filter_complex',
+ f'[0]trim=start_frame={start}:end_frame={end}[v0]', '-map', '[v0]',
+ '-f', 'image2', '-v', 'error', '-start_number', '0', '-threads', '1',
+ f'{output_folder}/{img_format}'
+ ]
+ if resolution:
+ height, width = resolution
+ command.insert(3, '-s')
+ command.insert(4, '%dx%d' % (width, height))
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+
+
+def images_to_video(input_folder: str,
+ output_path: str,
+ remove_raw_file: bool = False,
+ img_format: str = '%06d.png',
+ fps: Union[int, float] = 30,
+ resolution: Optional[Union[Tuple[int, int],
+ Tuple[float, float]]] = None,
+ start: int = 0,
+ end: Optional[int] = None,
+ disable_log: bool = False) -> None:
+ """Convert a folder of images to a video.
+
+ Args:
+ input_folder (str): input image folder
+ output_path (str): output video file path
+ remove_raw_file (bool, optional): whether remove raw images.
+ Defaults to False.
+        img_format (str, optional): format to name the images.
+ Defaults to '%06d.png'.
+ fps (Union[int, float], optional): output video fps. Defaults to 30.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): (height, width) of output.
+ defaults to None.
+ start (int, optional): start frame index. Inclusive.
+ If < 0, will be converted to frame_index range in [0, frame_num].
+ Defaults to 0.
+ end (int, optional): end frame index. Exclusive.
+ Could be positive int or negative int or None.
+ If None, all frames from start till the last frame are included.
+ Defaults to None.
+        disable_log (bool, optional): whether to hide the ffmpeg command log.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None
+ """
+ check_input_path(
+ input_folder,
+ allowed_suffix=[],
+ tag='input image folder',
+ path_type='dir')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+ input_folderinfo = Path(input_folder)
+ num_frames = len(os.listdir(input_folder))
+ start = (min(start, num_frames - 1) + num_frames) % num_frames
+ end = (min(end, num_frames - 1) +
+ num_frames) % num_frames if end is not None else num_frames
+ temp_input_folder = None
+ if img_format is None:
+ temp_input_folder = os.path.join(input_folderinfo.parent,
+ input_folderinfo.name + '_temp')
+ img_format = images_to_sorted_images(input_folder, temp_input_folder)
+
+ command = [
+ 'ffmpeg',
+ '-y',
+ '-threads',
+ '4',
+ '-start_number',
+ f'{start}',
+ '-r',
+ f'{fps}',
+ '-i',
+ f'{input_folder}/{img_format}'
+ if temp_input_folder is None else f'{temp_input_folder}/{img_format}',
+ '-frames:v',
+ f'{end - start}',
+ '-profile:v',
+ 'baseline',
+ '-level',
+ '3.0',
+ '-c:v',
+ 'libx264',
+ '-pix_fmt',
+ 'yuv420p',
+ '-vf',
+ 'scale=trunc(iw/2)*2:trunc(ih/2)*2', # Ensure width and height are divisible by 2
+ '-an',
+ '-v',
+ 'error',
+ '-loglevel',
+ 'error',
+ output_path,
+ ]
+ if resolution:
+ height, width = resolution
+ width += width % 2
+ height += height % 2
+ command.insert(1, '-s')
+ command.insert(2, '%dx%d' % (width, height))
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+ if remove_raw_file:
+ if Path(input_folder).is_dir():
+ shutil.rmtree(input_folder)
+ if temp_input_folder is not None:
+ if Path(temp_input_folder).is_dir():
+ shutil.rmtree(temp_input_folder)
+
+
+def images_to_gif(
+ input_folder: str,
+ output_path: str,
+ remove_raw_file: bool = False,
+ img_format: str = '%06d.png',
+ fps: int = 15,
+ resolution: Optional[Union[Tuple[int, int], Tuple[float, float]]] = None,
+ start: int = 0,
+ end: Optional[int] = None,
+ disable_log: bool = False,
+) -> None:
+ """Convert series of images to a video, similar to images_to_video, but
+ provide more suitable parameters.
+
+ Args:
+ input_folder (str): input image folder.
+ output_path (str): output gif file path.
+ remove_raw_file (bool, optional): whether remove raw images.
+ Defaults to False.
+ img_format (str, optional): format to name the images.
+ Defaults to '%06d.png'.
+ fps (int, optional): output video fps. Defaults to 15.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): (height, width) of output. Defaults to None.
+ start (int, optional): start frame index. Inclusive.
+ If < 0, will be converted to frame_index range in [0, frame_num].
+ Defaults to 0.
+ end (int, optional): end frame index. Exclusive.
+ Could be positive int or negative int or None.
+ If None, all frames from start till the last frame are included.
+ Defaults to None.
+        disable_log (bool, optional): whether to hide the ffmpeg command log.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None
+ """
+ input_folderinfo = Path(input_folder)
+ check_input_path(
+ input_folder,
+ allowed_suffix=[],
+ tag='input image folder',
+ path_type='dir')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.gif'],
+ tag='output gif',
+ path_type='file',
+ overwrite=True)
+ num_frames = len(os.listdir(input_folder))
+ start = (min(start, num_frames - 1) + num_frames) % num_frames
+ end = (min(end, num_frames - 1) +
+ num_frames) % num_frames if end is not None else num_frames
+ temp_input_folder = None
+ if img_format is None:
+ file_list = []
+ temp_input_folder = os.path.join(input_folderinfo.parent,
+ input_folderinfo.name + '_temp')
+ os.makedirs(temp_input_folder, exist_ok=True)
+ pngs = glob.glob(os.path.join(input_folder, '*.png'))
+ ext = 'png'
+ if pngs:
+ ext = 'png'
+ file_list.extend(pngs)
+ jpgs = glob.glob(os.path.join(input_folder, '*.jpg'))
+ if jpgs:
+ ext = 'jpg'
+ file_list.extend(jpgs)
+ file_list.sort()
+ for index, file_name in enumerate(file_list):
+ shutil.copy(
+ file_name,
+ os.path.join(temp_input_folder, '%06d.%s' % (index + 1, ext)))
+ input_folder = temp_input_folder
+ img_format = '%06d.' + ext
+
+ command = [
+ 'ffmpeg',
+ '-y',
+ '-threads',
+ '4',
+ '-start_number',
+ f'{start}',
+ '-r',
+ f'{fps}',
+ '-i',
+ f'{input_folder}/{img_format}',
+ '-frames:v',
+ f'{end - start}',
+ '-loglevel',
+ 'error',
+ '-v',
+ 'error',
+ output_path,
+ ]
+ if resolution:
+ height, width = resolution
+ command.insert(1, '-s')
+ command.insert(2, '%dx%d' % (width, height))
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+ if remove_raw_file:
+ shutil.rmtree(input_folder)
+ if temp_input_folder is not None:
+ shutil.rmtree(temp_input_folder)
+
+
+def gif_to_video(input_path: str,
+ output_path: str,
+ fps: int = 30,
+ remove_raw_file: bool = False,
+ resolution: Optional[Union[Tuple[int, int],
+ Tuple[float, float]]] = None,
+ disable_log: bool = False) -> None:
+ """Convert a gif file to a video.
+
+ Args:
+ input_path (str): input gif file path.
+ output_path (str): output video file path.
+ fps (int, optional): fps. Defaults to 30.
+ remove_raw_file (bool, optional): whether remove original input file.
+ Defaults to False.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): (height, width) of output. Defaults to None.
+        disable_log (bool, optional): whether to hide the ffmpeg command log.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None
+ """
+ check_input_path(
+ input_path, allowed_suffix=['.gif'], tag='input gif', path_type='file')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+ command = [
+ 'ffmpeg', '-i', input_path, '-r', f'{fps}', '-loglevel', 'error', '-y',
+ output_path, '-threads', '4'
+ ]
+ if resolution:
+ height, width = resolution
+ command.insert(3, '-s')
+ command.insert(4, '%dx%d' % (width, height))
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+ if remove_raw_file:
+ subprocess.call(['rm', '-f', input_path])
+
+
+def gif_to_images(input_path: str,
+ output_folder: str,
+ fps: int = 30,
+ img_format: str = '%06d.png',
+ resolution: Optional[Union[Tuple[int, int],
+ Tuple[float, float]]] = None,
+ disable_log: bool = False) -> None:
+ """Convert a gif file to a folder of images.
+
+ Args:
+ input_path (str): input gif file path.
+ output_folder (str): output folder to save the images.
+ fps (int, optional): fps. Defaults to 30.
+ img_format (str, optional): output image name format.
+ Defaults to '%06d.png'.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): (height, width) of output.
+ Defaults to None.
+        disable_log (bool, optional): whether to hide the ffmpeg command log.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None
+ """
+ check_input_path(
+ input_path, allowed_suffix=['.gif'], tag='input gif', path_type='file')
+ prepare_output_path(
+ output_folder,
+ allowed_suffix=[],
+ tag='output image folder',
+ path_type='dir',
+ overwrite=True)
+ command = [
+ 'ffmpeg', '-r', f'{fps}', '-i', input_path, '-loglevel', 'error', '-f',
+ 'image2', '-v', 'error', '-threads', '4', '-y', '-start_number', '0',
+ f'{output_folder}/{img_format}'
+ ]
+ if resolution:
+ height, width = resolution
+ command.insert(3, '-s')
+ command.insert(4, '%dx%d' % (width, height))
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+
+
+def crop_video(
+ input_path: str,
+ output_path: str,
+ box: Optional[Union[List[int], Tuple[int, int, int, int]]] = None,
+ resolution: Optional[Union[Tuple[int, int], Tuple[float, float]]] = None,
+ disable_log: bool = False,
+) -> None:
+    """Spatially crop a video or gif file.
+
+ Args:
+ input_path (str): input video or gif file path.
+ output_path (str): output video or gif file path.
+        box (Iterable[int], optional): [x, y] of the top-left corner of the
+            crop region followed by its width and height. Defaults to None,
+            which keeps the full frame.
+        resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+            optional): (height, width) of output. Defaults to None.
+        disable_log (bool, optional): whether to suppress the ffmpeg command
+            info. Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+        None
+ """
+ check_input_path(
+ input_path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='input video',
+ path_type='file')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+
+ info = vid_info_reader(input_path)
+ width, height = int(info['width']), int(info['height'])
+
+ if box is None:
+ box = [0, 0, width, height]
+
+ assert len(box) == 4
+ x, y, w, h = box
+ assert (w > 0 and h > 0)
+ command = [
+ 'ffmpeg', '-i', input_path, '-vcodec', 'libx264', '-vf',
+ 'crop=%d:%d:%d:%d' % (w, h, x, y), '-loglevel', 'error', '-y',
+ output_path
+ ]
+ if resolution:
+ height, width = resolution
+ width += width % 2
+ height += height % 2
+ command.insert(-1, '-s')
+ command.insert(-1, '%dx%d' % (width, height))
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+
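The `box` argument follows an (x, y, width, height) convention, which the function translates into ffmpeg's `crop=w:h:x:y` filter. A minimal usage sketch (paths are placeholders):

```python
# Keep a 256x256 patch whose top-left corner sits at (x=64, y=32).
# Internally this becomes the ffmpeg filter 'crop=256:256:64:32'.
crop_video('demo_out/full.mp4', 'demo_out/patch.mp4', box=[64, 32, 256, 256])
```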
+
+def slice_video(input_path: str,
+ output_path: str,
+ start: int = 0,
+ end: Optional[int] = None,
+ resolution: Optional[Union[Tuple[int, int],
+ Tuple[float, float]]] = None,
+ disable_log: bool = False) -> None:
+ """Temporally crop a video/gif into another video/gif.
+
+ Args:
+ input_path (str): input video or gif file path.
+        output_path (str): output video or gif file path.
+ start (int, optional): start frame index. Defaults to 0.
+ end (int, optional): end frame index. Exclusive.
+ Could be positive int or negative int or None.
+ If None, all frames from start till the last frame are included.
+ Defaults to None.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): (height, width) of output. Defaults to None.
+        disable_log (bool, optional): whether to suppress the ffmpeg command
+            info. Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+        None
+ """
+ info = vid_info_reader(input_path)
+ num_frames = int(info['nb_frames'])
+ start = (min(start, num_frames - 1) + num_frames) % num_frames
+ end = (min(end, num_frames - 1) +
+ num_frames) % num_frames if end is not None else num_frames
+ command = [
+ 'ffmpeg', '-y', '-i', input_path, '-filter_complex',
+ f'[0]trim=start_frame={start}:end_frame={end}[v0]', '-map', '[v0]',
+ '-loglevel', 'error', '-vcodec', 'libx264', output_path
+ ]
+ if resolution:
+ height, width = resolution
+ width += width % 2
+ height += height % 2
+ command.insert(1, '-s')
+ command.insert(2, '%dx%d' % (width, height))
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+
+
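The start/end handling in `slice_video` relies on a small modulo trick to map negative or out-of-range frame indices into `[0, num_frames)`. A minimal sketch with made-up numbers:

```python
# Illustration only: num_frames would normally come from ffprobe metadata.
num_frames = 100

def normalize_start(start: int) -> int:
    # Clamp to the last frame, then wrap negative indices around.
    return (min(start, num_frames - 1) + num_frames) % num_frames

assert normalize_start(0) == 0      # first frame
assert normalize_start(-10) == 90   # tenth frame from the end
assert normalize_start(150) == 99   # clamped to the last frame
```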
+def spatial_concat_video(input_path_list: List[str],
+ output_path: str,
+ array: List[int] = [1, 1],
+ direction: Literal['h', 'w'] = 'h',
+ resolution: Union[Tuple[int,
+ int], List[int], List[float],
+ Tuple[float, float]] = (512, 512),
+ remove_raw_files: bool = False,
+ padding: int = 0,
+ disable_log: bool = False) -> None:
+ """Spatially concat some videos as an array video.
+
+ Args:
+ input_path_list (list): input video or gif file list.
+ output_path (str): output video or gif file path.
+        array (List[int], optional): number of rows and columns of
+            the video array. Defaults to [1, 1].
+        direction (str, optional): choose between 'h' and 'w', controlling
+            whether the grid is filled horizontally or vertically.
+            Defaults to 'h'.
+        resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+            optional): (height, width) of output.
+            Defaults to (512, 512).
+        remove_raw_files (bool, optional): whether to remove the raw input
+            videos. Defaults to False.
+        padding (int, optional): width in pixels between videos.
+            Defaults to 0.
+        disable_log (bool, optional): whether to suppress the ffmpeg command
+            info. Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None
+ """
+ lowercase = string.ascii_lowercase
+ assert len(array) == 2
+ assert (array[0] * array[1]) >= len(input_path_list)
+ for path in input_path_list:
+ check_input_path(
+ path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='input video',
+ path_type='file')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+
+ command = ['ffmpeg']
+ height, width = resolution
+ scale_command = []
+ for index, vid_file in enumerate(input_path_list):
+ command.append('-i')
+ command.append(vid_file)
+ scale_command.append(
+ '[%d:v]scale=%d:%d:force_original_aspect_ratio=0[v%d];' %
+ (index, width, height, index))
+
+ scale_command = ' '.join(scale_command)
+ pad_command = '[v%d]pad=%d:%d[%s];' % (0, width * array[1] + padding *
+ (array[1] - 1),
+ height * array[0] + padding *
+ (array[0] - 1), lowercase[0])
+ for index in range(1, len(input_path_list)):
+ if direction == 'h':
+ pad_width = index % array[1] * (width + padding)
+ pad_height = index // array[1] * (height + padding)
+ else:
+ pad_width = index % array[0] * (width + padding)
+ pad_height = index // array[0] * (height + padding)
+
+ pad_command += '[%s][v%d]overlay=%d:%d' % (lowercase[index - 1], index,
+ pad_width, pad_height)
+ if index != len(input_path_list) - 1:
+ pad_command += '[%s];' % lowercase[index]
+
+ command += [
+ '-filter_complex',
+ '%s%s' % (scale_command, pad_command), '-loglevel', 'error', '-y',
+ output_path
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+
+ if remove_raw_files:
+ command = ['rm', '-f'] + input_path_list
+ subprocess.call(command)
+
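To make the string building above concrete, this is roughly the `-filter_complex` graph `spatial_concat_video` assembles for two 512x512 clips placed side by side (`array=[1, 2]`, `direction='h'`, `padding=0`); the file names in the comment are hypothetical:

```python
# Each input is scaled to 512x512, the first is padded onto a 1024x512 canvas,
# and the second is overlaid at x=512.
filter_complex = (
    '[0:v]scale=512:512:force_original_aspect_ratio=0[v0];'
    ' [1:v]scale=512:512:force_original_aspect_ratio=0[v1];'
    '[v0]pad=1024:512[a];'
    '[a][v1]overlay=512:0'
)
# ffmpeg -i left.mp4 -i right.mp4 -filter_complex "<filter_complex>" -y out.mp4
```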
+
+def temporal_concat_video(input_path_list: List[str],
+ output_path: str,
+ resolution: Union[Tuple[int, int],
+ Tuple[float, float]] = (512, 512),
+ remove_raw_files: bool = False,
+ disable_log: bool = False) -> None:
+    """Concatenate videos or gifs into a temporal sequence, and save as a
+    new video or gif file.
+
+ Args:
+ input_path_list (List[str]): list of input video paths.
+ output_path (str): output video file path.
+        resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+            optional): (height, width) of output.
+            Defaults to (512, 512).
+        remove_raw_files (bool, optional): whether to remove the input videos.
+            Defaults to False.
+        disable_log (bool, optional): whether to suppress the ffmpeg command
+            info. Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None.
+ """
+ for path in input_path_list:
+ check_input_path(
+ path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='input video',
+ path_type='file')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+
+ height, width = resolution
+ command = ['ffmpeg']
+ concat_command = []
+ scale_command = []
+ for index, vid_file in enumerate(input_path_list):
+ command.append('-i')
+ command.append(vid_file)
+ scale_command.append(
+ '[%d:v]scale=%d:%d:force_original_aspect_ratio=0[v%d];' %
+ (index, width, height, index))
+ concat_command.append('[v%d]' % index)
+ concat_command = ''.join(concat_command)
+ scale_command = ''.join(scale_command)
+ command += [
+ '-filter_complex',
+ '%s%sconcat=n=%d:v=1:a=0[v]' %
+ (scale_command, concat_command, len(input_path_list)), '-loglevel',
+ 'error', '-map', '[v]', '-c:v', 'libx264', '-y', output_path
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+
+ if remove_raw_files:
+ command = ['rm'] + input_path_list
+ subprocess.call(command)
+
+
+def compress_video(input_path: str,
+ output_path: str,
+ compress_rate: int = 1,
+ down_sample_scale: Union[float, int] = 1,
+ fps: int = 30,
+ disable_log: bool = False) -> None:
+ """Compress a video file.
+
+ Args:
+ input_path (str): input video file path.
+ output_path (str): output video file path.
+        compress_rate (int, optional): compression rate; divides the output
+            bit rate. Defaults to 1.
+        down_sample_scale (Union[float, int], optional): spatial down sample
+            scale. Defaults to 1.
+        fps (int, optional): Frames per second. Defaults to 30.
+        disable_log (bool, optional): whether to suppress the ffmpeg command
+            info. Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None.
+ """
+ input_pathinfo = Path(input_path)
+
+ check_input_path(
+ input_path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='input video',
+ path_type='file')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+
+ info = vid_info_reader(input_path)
+
+ width = int(info['width'])
+ height = int(info['height'])
+ bit_rate = int(info['bit_rate'])
+ duration = float(info['duration'])
+ if (output_path == input_path) or (not output_path):
+ temp_outpath = os.path.join(
+ os.path.abspath(input_pathinfo.parent),
+ 'temp_file' + input_pathinfo.suffix)
+ else:
+ temp_outpath = output_path
+ new_width = int(width / down_sample_scale)
+ new_width += new_width % 2
+ new_height = int(height / down_sample_scale)
+ new_height += new_height % 2
+ command = [
+ 'ffmpeg', '-y', '-r',
+ str(info['r_frame_rate']), '-i', input_path, '-loglevel', 'error',
+ '-b:v', f'{bit_rate / (compress_rate * down_sample_scale)}', '-r',
+ f'{fps}', '-t', f'{duration}', '-s',
+ '%dx%d' % (new_width, new_height), temp_outpath
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+ if (output_path == input_path) or (not output_path):
+ subprocess.call(['mv', '-f', temp_outpath, input_path])
+
+
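A hedged usage sketch of `compress_video`: with `compress_rate=2` and `down_sample_scale=2` the output bit rate is divided by four and both spatial dimensions are halved (the paths below are placeholders):

```python
compress_video(
    input_path='demo_out/raw.mp4',
    output_path='demo_out/raw_small.mp4',
    compress_rate=2,       # together with down_sample_scale, divides bit rate
    down_sample_scale=2,   # halves width and height
    fps=30,
)
```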
+def pad_for_libx264(image_array):
+ """Pad zeros if width or height of image_array is not divisible by 2.
+    Otherwise libx264 will complain:
+
+ \"[libx264 @ 0x1b1d560] width not divisible by 2 \"
+
+ Args:
+ image_array (np.ndarray):
+ Image or images load by cv2.imread().
+ Possible shapes:
+ 1. [height, width]
+ 2. [height, width, channels]
+ 3. [images, height, width]
+ 4. [images, height, width, channels]
+
+ Returns:
+ np.ndarray:
+            An image with both edges divisible by 2.
+ """
+ if image_array.ndim == 2 or \
+ (image_array.ndim == 3 and image_array.shape[2] == 3):
+ hei_index = 0
+ wid_index = 1
+ elif image_array.ndim == 4 or \
+ (image_array.ndim == 3 and image_array.shape[2] != 3):
+ hei_index = 1
+ wid_index = 2
+ else:
+ return image_array
+ hei_pad = image_array.shape[hei_index] % 2
+ wid_pad = image_array.shape[wid_index] % 2
+ if hei_pad + wid_pad > 0:
+ pad_width = []
+ for dim_index in range(image_array.ndim):
+ if dim_index == hei_index:
+ pad_width.append((0, hei_pad))
+ elif dim_index == wid_index:
+ pad_width.append((0, wid_pad))
+ else:
+ pad_width.append((0, 0))
+ values = 0
+ image_array = \
+ np.pad(image_array,
+ pad_width,
+ mode='constant', constant_values=values)
+ return image_array
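A quick sanity check of `pad_for_libx264` on a batch of frames with odd height and width (shapes are illustrative only):

```python
import numpy as np

frames = np.zeros((3, 481, 639, 3), dtype=np.uint8)   # odd height and width
padded = pad_for_libx264(frames)
assert padded.shape == (3, 482, 640, 3)               # both edges now even
```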
diff --git a/detrsmpl/utils/geometry.py b/detrsmpl/utils/geometry.py
new file mode 100644
index 0000000000000000000000000000000000000000..2cfb508f5f9305cc799a4258d76c0460f7b565d4
--- /dev/null
+++ b/detrsmpl/utils/geometry.py
@@ -0,0 +1,536 @@
+import numpy as np
+import torch
+from torch.nn import functional as F
+import torchgeometry as tgm
+
+def batch_rodrigues(theta):
+ """Convert axis-angle representation to rotation matrix.
+
+ Args:
+ theta: size = [B, 3]
+ Returns:
+        Rotation matrix corresponding to the axis-angle input -- size = [B, 3, 3]
+ """
+ l1norm = torch.norm(theta + 1e-8, p=2, dim=1)
+ angle = torch.unsqueeze(l1norm, -1)
+ normalized = torch.div(theta, angle)
+ angle = angle * 0.5
+ v_cos = torch.cos(angle)
+ v_sin = torch.sin(angle)
+ quat = torch.cat([v_cos, v_sin * normalized], dim=1)
+ return quat_to_rotmat(quat)
+
+
+def quat_to_rotmat(quat):
+ """Convert quaternion coefficients to rotation matrix.
+
+ Args:
+ quat: size = [B, 4] 4 <===>(w, x, y, z)
+ Returns:
+ Rotation matrix corresponding to the quaternion -- size = [B, 3, 3]
+ """
+ norm_quat = quat
+ norm_quat = norm_quat / norm_quat.norm(p=2, dim=1, keepdim=True)
+ w = norm_quat[:, 0]
+ x = norm_quat[:, 1]
+ y = norm_quat[:, 2]
+ z = norm_quat[:, 3]
+ B = quat.size(0)
+
+ w2, x2, y2, z2 = w.pow(2), x.pow(2), y.pow(2), z.pow(2)
+ wx, wy, wz = w * x, w * y, w * z
+ xy, xz, yz = x * y, x * z, y * z
+
+ rotMat = torch.stack([
+ w2 + x2 - y2 - z2, 2 * xy - 2 * wz, 2 * wy + 2 * xz, 2 * wz + 2 * xy,
+ w2 - x2 + y2 - z2, 2 * yz - 2 * wx, 2 * xz - 2 * wy, 2 * wx + 2 * yz,
+ w2 - x2 - y2 + z2
+ ],
+ dim=1).view(B, 3, 3)
+ return rotMat
+
+
+def rot6d_to_rotmat(x):
+ """Convert 6D rotation representation to 3x3 rotation matrix.
+
+ Based on Zhou et al., "On the Continuity of Rotation
+ Representations in Neural Networks", CVPR 2019
+ Input:
+ (B,6) Batch of 6-D rotation representations
+ Output:
+ (B,3,3) Batch of corresponding rotation matrices
+ """
+    if isinstance(x, np.ndarray):
+        # np.ndarray.view() takes a dtype, not a shape; convert to a tensor
+        # so the reshape and the torch ops below work for both input types.
+        x = torch.from_numpy(x)
+    x = x.reshape(-1, 3, 2)
+ a1 = x[:, :, 0]
+ a2 = x[:, :, 1]
+ b1 = F.normalize(a1)
+ b2 = F.normalize(a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1)
+ b3 = torch.cross(b1, b2)
+ return torch.stack((b1, b2, b3), dim=-1)
+
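Since the 6D representation is turned into a rotation via Gram-Schmidt (b1 and b2 orthonormalized, b3 = b1 x b2), the output should always be a proper rotation matrix. A minimal check, assuming `rot6d_to_rotmat` is imported from this module:

```python
import torch

x6d = torch.randn(4, 6)
R = rot6d_to_rotmat(x6d)                                   # (4, 3, 3)
assert torch.allclose(R.transpose(1, 2) @ R,
                      torch.eye(3).expand(4, 3, 3), atol=1e-5)
assert torch.allclose(torch.det(R), torch.ones(4), atol=1e-5)
```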
+def rot6d_to_axis_angle(x):
+ batch_size = x.shape[0]
+
+ x = x.view(-1, 3, 2)
+ a1 = x[:, :, 0]
+ a2 = x[:, :, 1]
+ b1 = F.normalize(a1)
+ b2 = F.normalize(a2 - torch.einsum('bi,bi->b', b1, a2).unsqueeze(-1) * b1)
+ b3 = torch.cross(b1, b2)
+ rot_mat = torch.stack((b1, b2, b3), dim=-1) # 3x3 rotation matrix
+
+ rot_mat = torch.cat([rot_mat, torch.zeros((batch_size, 3, 1)).cuda().float()], 2) # 3x4 rotation matrix
+ axis_angle = tgm.rotation_matrix_to_angle_axis(rot_mat).reshape(-1, 3) # axis-angle
+ axis_angle[torch.isnan(axis_angle)] = 0.0
+ return axis_angle
+
+def rotation_matrix_to_angle_axis(rotation_matrix):
+ """
+ This function is borrowed from https://github.com/kornia/kornia
+ Convert 3x4 rotation matrix to Rodrigues vector
+ Args:
+ rotation_matrix (Tensor): rotation matrix.
+ Returns:
+ Tensor: Rodrigues vector transformation.
+ Shape:
+ - Input: :math:`(N, 3, 4)`
+ - Output: :math:`(N, 3)`
+ Example:
+ >>> input = torch.rand(2, 3, 4) # Nx3x4
+ >>> output = tgm.rotation_matrix_to_angle_axis(input) # Nx3
+ """
+ if rotation_matrix.shape[1:] == (3, 3):
+ rot_mat = rotation_matrix.reshape(-1, 3, 3)
+ hom = torch.tensor([0, 0, 1],
+ dtype=torch.float32,
+ device=rotation_matrix.device)
+ hom = hom.reshape(1, 3, 1).expand(rot_mat.shape[0], -1, -1)
+ rotation_matrix = torch.cat([rot_mat, hom], dim=-1)
+
+ quaternion = rotation_matrix_to_quaternion(rotation_matrix)
+ aa = quaternion_to_angle_axis(quaternion)
+ aa[torch.isnan(aa)] = 0.0
+ return aa
+
+
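As a sanity check of these borrowed conversions, a round trip from axis-angle to rotation matrix and back should reproduce the input for rotation angles below pi (test values are arbitrary):

```python
import torch

theta = torch.tensor([[0.3, 0.0, 0.0], [0.0, -1.2, 0.4]])
R = batch_rodrigues(theta)                        # (2, 3, 3)
theta_back = rotation_matrix_to_angle_axis(R)     # also accepts (N, 3, 3)
assert torch.allclose(theta, theta_back, atol=1e-4)
```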
+def quaternion_to_angle_axis(quaternion: torch.Tensor) -> torch.Tensor:
+ """
+ This function is borrowed from https://github.com/kornia/kornia
+ Convert quaternion vector to angle axis of rotation.
+ Adapted from ceres C++ library: ceres-solver/include/ceres/rotation.h
+ Args:
+ quaternion (torch.Tensor): tensor with quaternions.
+ Return:
+ torch.Tensor: tensor with angle axis of rotation.
+ Shape:
+ - Input: :math:`(*, 4)` where `*` means, any number of dimensions
+ - Output: :math:`(*, 3)`
+ Example:
+ >>> quaternion = torch.rand(2, 4) # Nx4
+ >>> angle_axis = tgm.quaternion_to_angle_axis(quaternion) # Nx3
+ """
+ if not torch.is_tensor(quaternion):
+ raise TypeError('Input type is not a torch.Tensor. Got {}'.format(
+ type(quaternion)))
+
+ if not quaternion.shape[-1] == 4:
+ raise ValueError(
+ 'Input must be a tensor of shape Nx4 or 4. Got {}'.format(
+ quaternion.shape))
+ # unpack input and compute conversion
+ q1: torch.Tensor = quaternion[..., 1]
+ q2: torch.Tensor = quaternion[..., 2]
+ q3: torch.Tensor = quaternion[..., 3]
+ sin_squared_theta: torch.Tensor = q1 * q1 + q2 * q2 + q3 * q3
+
+ sin_theta: torch.Tensor = torch.sqrt(sin_squared_theta)
+ cos_theta: torch.Tensor = quaternion[..., 0]
+ two_theta: torch.Tensor = 2.0 * torch.where(
+ cos_theta < 0.0, torch.atan2(-sin_theta, -cos_theta),
+ torch.atan2(sin_theta, cos_theta))
+
+ k_pos: torch.Tensor = two_theta / sin_theta
+ k_neg: torch.Tensor = 2.0 * torch.ones_like(sin_theta)
+ k: torch.Tensor = torch.where(sin_squared_theta > 0.0, k_pos, k_neg)
+
+ angle_axis: torch.Tensor = torch.zeros_like(quaternion)[..., :3]
+ angle_axis[..., 0] += q1 * k
+ angle_axis[..., 1] += q2 * k
+ angle_axis[..., 2] += q3 * k
+ return angle_axis
+
+
+def rotation_matrix_to_quaternion(rotation_matrix, eps=1e-6):
+ """
+ This function is borrowed from https://github.com/kornia/kornia
+ Convert 3x4 rotation matrix to 4d quaternion vector
+ This algorithm is based on algorithm described in
+ https://github.com/KieranWynn/pyquaternion/blob/master/pyquaternion/quaternion.py#L201
+ Args:
+ rotation_matrix (Tensor): the rotation matrix to convert.
+ Return:
+ Tensor: the rotation in quaternion
+ Shape:
+ - Input: :math:`(N, 3, 4)`
+ - Output: :math:`(N, 4)`
+ Example:
+ >>> input = torch.rand(4, 3, 4) # Nx3x4
+ >>> output = tgm.rotation_matrix_to_quaternion(input) # Nx4
+ """
+ if not torch.is_tensor(rotation_matrix):
+ raise TypeError('Input type is not a torch.Tensor. Got {}'.format(
+ type(rotation_matrix)))
+
+ if len(rotation_matrix.shape) > 3:
+ raise ValueError(
+ 'Input size must be a three dimensional tensor. Got {}'.format(
+ rotation_matrix.shape))
+ if not rotation_matrix.shape[-2:] == (3, 4):
+ raise ValueError(
+ 'Input size must be a N x 3 x 4 tensor. Got {}'.format(
+ rotation_matrix.shape))
+
+ rmat_t = torch.transpose(rotation_matrix, 1, 2)
+
+ mask_d2 = rmat_t[:, 2, 2] < eps
+
+ mask_d0_d1 = rmat_t[:, 0, 0] > rmat_t[:, 1, 1]
+ mask_d0_nd1 = rmat_t[:, 0, 0] < -rmat_t[:, 1, 1]
+
+ t0 = 1 + rmat_t[:, 0, 0] - rmat_t[:, 1, 1] - rmat_t[:, 2, 2]
+ q0 = torch.stack([
+ rmat_t[:, 1, 2] - rmat_t[:, 2, 1], t0,
+ rmat_t[:, 0, 1] + rmat_t[:, 1, 0], rmat_t[:, 2, 0] + rmat_t[:, 0, 2]
+ ], -1)
+ t0_rep = t0.repeat(4, 1).t()
+
+ t1 = 1 - rmat_t[:, 0, 0] + rmat_t[:, 1, 1] - rmat_t[:, 2, 2]
+ q1 = torch.stack([
+ rmat_t[:, 2, 0] - rmat_t[:, 0, 2], rmat_t[:, 0, 1] + rmat_t[:, 1, 0],
+ t1, rmat_t[:, 1, 2] + rmat_t[:, 2, 1]
+ ], -1)
+ t1_rep = t1.repeat(4, 1).t()
+
+ t2 = 1 - rmat_t[:, 0, 0] - rmat_t[:, 1, 1] + rmat_t[:, 2, 2]
+ q2 = torch.stack([
+ rmat_t[:, 0, 1] - rmat_t[:, 1, 0], rmat_t[:, 2, 0] + rmat_t[:, 0, 2],
+ rmat_t[:, 1, 2] + rmat_t[:, 2, 1], t2
+ ], -1)
+ t2_rep = t2.repeat(4, 1).t()
+
+ t3 = 1 + rmat_t[:, 0, 0] + rmat_t[:, 1, 1] + rmat_t[:, 2, 2]
+ q3 = torch.stack([
+ t3, rmat_t[:, 1, 2] - rmat_t[:, 2, 1],
+ rmat_t[:, 2, 0] - rmat_t[:, 0, 2], rmat_t[:, 0, 1] - rmat_t[:, 1, 0]
+ ], -1)
+ t3_rep = t3.repeat(4, 1).t()
+
+ mask_c0 = mask_d2 * mask_d0_d1
+ mask_c1 = mask_d2 * ~mask_d0_d1
+ mask_c2 = ~mask_d2 * mask_d0_nd1
+ mask_c3 = ~mask_d2 * ~mask_d0_nd1
+ mask_c0 = mask_c0.view(-1, 1).type_as(q0)
+ mask_c1 = mask_c1.view(-1, 1).type_as(q1)
+ mask_c2 = mask_c2.view(-1, 1).type_as(q2)
+ mask_c3 = mask_c3.view(-1, 1).type_as(q3)
+
+ q = q0 * mask_c0 + q1 * mask_c1 + q2 * mask_c2 + q3 * mask_c3
+ q /= torch.sqrt(t0_rep * mask_c0 + t1_rep * mask_c1 + # noqa
+ t2_rep * mask_c2 + t3_rep * mask_c3) # noqa
+ q *= 0.5
+ return q
+
+
+def perspective_projection(points, rotation, translation, focal_length,
+ camera_center):
+ """This function computes the perspective projection of a set of points.
+
+ Input:
+ points (bs, N, 3): 3D points
+ rotation (bs, 3, 3): Camera rotation
+ translation (bs, 3): Camera translation
+ focal_length (bs,) or scalar: Focal length
+ camera_center (bs, 2): Camera center
+ """
+ batch_size = points.shape[0]
+ K = torch.zeros([batch_size, 3, 3], device=points.device)
+ K[:, 0, 0] = focal_length
+ K[:, 1, 1] = focal_length
+ K[:, 2, 2] = 1.
+ K[:, :-1, -1] = camera_center
+
+ # Transform points
+ points = torch.einsum('bij,bkj->bki', rotation, points)
+ points = points + translation.unsqueeze(1)
+
+ # Apply perspective distortion
+ projected_points = points / points[:, :, -1].unsqueeze(-1)
+
+ # Apply camera intrinsics
+ projected_points = torch.einsum('bij,bkj->bki', K, projected_points)
+
+ return projected_points[:, :, :-1]
+
+
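A small worked example of `perspective_projection` with an identity rotation: a point (0.1, 0.2, 5) with focal length 1000 and principal point (112, 112) lands at (0.1/5*1000 + 112, 0.2/5*1000 + 112) = (132, 152). The numbers are arbitrary:

```python
import torch

points = torch.tensor([[[0.1, 0.2, 5.0]]])              # (B=1, N=1, 3)
rotation = torch.eye(3).unsqueeze(0)                     # identity
translation = torch.zeros(1, 3)
uv = perspective_projection(points, rotation, translation,
                            focal_length=1000.0,
                            camera_center=torch.tensor([[112.0, 112.0]]))
assert torch.allclose(uv, torch.tensor([[[132.0, 152.0]]]))
```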
+def estimate_translation_np(S,
+ joints_2d,
+ joints_conf,
+ focal_length=5000,
+ img_size=224):
+    """Find the camera translation that brings the 3D joints S closest to
+    the corresponding 2D joints joints_2d.
+
+ Input:
+ S: (25, 3) 3D joint locations
+ joints: (25, 3) 2D joint locations and confidence
+ Returns:
+ (3,) camera translation vector
+ """
+
+ num_joints = S.shape[0]
+ # focal length
+ f = np.array([focal_length, focal_length])
+ # optical center
+ center = np.array([img_size / 2., img_size / 2.])
+
+ # transformations
+ Z = np.reshape(np.tile(S[:, 2], (2, 1)).T, -1)
+ XY = np.reshape(S[:, 0:2], -1)
+ OO = np.tile(center, num_joints)
+ F = np.tile(f, num_joints)
+ weight2 = np.reshape(np.tile(np.sqrt(joints_conf), (2, 1)).T, -1)
+
+ # least squares
+ Q = np.array([
+ F * np.tile(np.array([1, 0]), num_joints),
+ F * np.tile(np.array([0, 1]), num_joints),
+ OO - np.reshape(joints_2d, -1)
+ ]).T
+ c = (np.reshape(joints_2d, -1) - OO) * Z - F * XY
+
+ # weighted least squares
+ W = np.diagflat(weight2)
+ Q = np.dot(W, Q)
+ c = np.dot(W, c)
+
+ # square matrix
+ A = np.dot(Q.T, Q)
+ b = np.dot(Q.T, c)
+
+ # solution
+ trans = np.linalg.solve(A, b)
+
+ return trans
+
+
+def estimate_translation(S, joints_2d, focal_length=5000., img_size=224.):
+    """Find the camera translation that brings the 3D joints S closest to
+    the corresponding 2D joints joints_2d.
+
+ Input:
+ S: (B, 49, 3) 3D joint locations
+ joints: (B, 49, 3) 2D joint locations and confidence
+ Returns:
+ (B, 3) camera translation vectors
+ """
+
+ device = S.device
+ # Use only joints 25:49 (GT joints)
+ S = S[:, 25:, :].cpu().numpy()
+ joints_2d = joints_2d[:, 25:, :].cpu().numpy()
+ joints_conf = joints_2d[:, :, -1]
+ joints_2d = joints_2d[:, :, :-1]
+ trans = np.zeros((S.shape[0], 3), dtype=np.float32)
+ # Find the translation for each example in the batch
+ for i in range(S.shape[0]):
+ S_i = S[i]
+ joints_i = joints_2d[i]
+ conf_i = joints_conf[i]
+ trans[i] = estimate_translation_np(S_i,
+ joints_i,
+ conf_i,
+ focal_length=focal_length,
+ img_size=img_size)
+ return torch.from_numpy(trans).to(device)
+
+
+def project_points(points_3d, camera, focal_length, img_res):
+    """Project 3D points to the image plane with a predicted weak-perspective
+    camera, converted to a perspective camera with the given focal length.
+
+ Notes:
+ batch size: B
+ point number: N
+ Args:
+ points_3d (Tensor([B, N, 3])): 3D points.
+ camera (Tensor([B, 3])): camera parameters with the
+ 3 channel as (scale, translation_x, translation_y)
+ Returns:
+ points_2d (Tensor([B, N, 2])): projected 2D points
+ in image space.
+ """
+ batch_size = points_3d.shape[0]
+ device = points_3d.device
+ cam_t = torch.stack([
+ camera[:, 1], camera[:, 2], 2 * focal_length /
+ (img_res * camera[:, 0] + 1e-9)
+ ],
+ dim=-1)
+ camera_center = camera.new_zeros([batch_size, 2])
+ rot_t = torch.eye(3, device=device,
+ dtype=points_3d.dtype).unsqueeze(0).expand(
+ batch_size, -1, -1)
+ keypoints_2d = perspective_projection(points_3d,
+ rotation=rot_t,
+ translation=cam_t,
+ focal_length=focal_length,
+ camera_center=camera_center)
+ return keypoints_2d
+
+def project_points_new(points_3d, pred_cam, focal_length, camera_center):
+    """Project 3D points to the image plane with a predicted weak-perspective
+    camera, converted to a perspective camera with the given focal length.
+
+ Notes:
+ batch size: B
+ point number: N
+ Args:
+ points_3d (Tensor([B, N, 3])): 3D points.
+ camera (Tensor([B, 3])): camera parameters with the
+ 3 channel as (scale, translation_x, translation_y)
+ Returns:
+ points_2d (Tensor([B, N, 2])): projected 2D points
+ in image space.
+ """
+ batch_size = points_3d.shape[0]
+ device = points_3d.device
+
+ (s, tx, ty) = (pred_cam[:, 0] + 1e-9), pred_cam[:, 1], pred_cam[:, 2]
+ depth, dx, dy = 1./s, tx/s, ty/s
+ cam_t = torch.stack([dx, dy, depth], 1)
+
+ # cam_t = torch.stack([
+ # camera[:, 1], camera[:, 2], 2 * focal_length /
+ # (img_res * camera[:, 0] + 1e-9)
+ # ],
+ # dim=-1)
+ rot_t = torch.eye(3, device=device,
+ dtype=points_3d.dtype).unsqueeze(0).expand(
+ batch_size, -1, -1)
+ keypoints_2d = perspective_projection(points_3d,
+ rotation=rot_t,
+ translation=cam_t,
+ focal_length=focal_length,
+ camera_center=camera_center)
+ return keypoints_2d
+
+
+
+
+def weak_perspective_projection(points, scale, translation):
+ """This function computes the weak perspective projection of a set of
+ points.
+
+ Input:
+ points (bs, N, 3): 3D points
+ scale (bs,1): scalar
+ translation (bs, 2): point 2D translation
+ """
+ projected_points = scale.view(
+ -1, 1, 1) * (points[:, :, :2] + translation.view(-1, 1, 2))
+
+ return projected_points
+
+
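The weak-perspective model simply scales the translated x/y coordinates and ignores depth: u = s * (x + tx), v = s * (y + ty). A tiny numeric check (values are arbitrary):

```python
import torch

pts = torch.tensor([[[0.5, 0.5, 3.0]]])       # depth (3.0) is ignored
scale = torch.tensor([2.0])
transl = torch.tensor([[0.1, -0.1]])
uv = weak_perspective_projection(pts, scale, transl)
assert torch.allclose(uv, torch.tensor([[[1.2, 0.8]]]))
```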
+def estimate_cam_weakperspective(joints3d,
+ joints2d,
+ joints2d_conf,
+ joints3d_conf,
+ img_size) -> torch.Tensor:
+ '''
+ img_size: wh
+ '''
+ w, h = img_size
+ if joints2d_conf is not None:
+ valid_ids = torch.where(joints2d_conf.view(-1) > 0)[0]
+ joints2d = joints2d[valid_ids]
+ if joints3d_conf is not None:
+ valid_ids = torch.where(joints3d_conf.view(-1) > 0)[0]
+ joints3d = joints3d[valid_ids]
+ x1 = torch.min(joints3d[..., 0])
+ x2 = torch.max(joints3d[..., 0])
+
+ y1 = torch.min(joints3d[..., 1])
+ y2 = torch.max(joints3d[..., 1])
+
+ # img_size = img_size if isinstance(img_size, int) else int(img_size[0])
+
+ u1 = 2*torch.min(joints2d[..., 0]) / w -1
+ u2 = 2*torch.max(joints2d[..., 0]) / w -1
+ v1 = (2 * torch.min(joints2d[..., 1])-h)/max(w,h)
+ v2 = (2 * torch.max(joints2d[..., 1])-h)/max(w,h)
+
+ # u1 = torch.min(joints2d[..., 0]) / w
+ # u2 = torch.max(joints2d[..., 0]) / w
+ # v1 = torch.min(joints2d[..., 1]) / h
+ # v2 = torch.max(joints2d[..., 1]) / h
+
+ sx = (u1 - u2) / (x1 - x2)
+ sy = (v1 - v2) / (y1 - y2)
+ s = torch.sqrt(sx * sy)
+
+ tx_1 = u1 / s - x1 # u1 = s*(tx_1 + x1)
+ ty_1 = v1 / s - y1 # v1 = s*(ty_1 + y1)
+
+ tx_2 = u2 / s - x2 # u2 = s*(tx_2 + x2)
+ ty_2 = v2 / s - y2 # v2 = s*(ty_2 + y2)
+
+ tx = (tx_1 + tx_2) / 2
+ ty = (ty_1 + ty_2) / 2
+ cam = torch.Tensor([s, tx, ty]).view(3)
+ return cam
+
+def estimate_cam_weakperspective_batch(
+ joints3d, joints2d,
+ joints2d_conf, joints3d_conf,
+ img_size):
+ '''
+ img_size: b,w,h
+ '''
+ device = joints3d.device
+ joints2d = joints2d.detach().cpu()
+ joints3d = joints3d.detach().cpu()
+
+ assert joints2d.ndim == 3 # B, J, 2
+ assert joints3d.ndim == 3 # B, J, 3
+
+ cam = torch.zeros(joints3d.shape[0], 3)
+ for i in range(joints3d.shape[0]):
+ joints3d_i = joints3d[i]
+ joints2d_i = joints2d[i]
+ if joints2d_conf is not None:
+ conf2d_i = joints2d_conf[i].detach().cpu()
+ else:
+ conf2d_i = None
+
+ if joints3d_conf is not None:
+ conf3d_i = joints3d_conf[i].detach().cpu()
+ else:
+ conf3d_i = None
+ cam[i] = estimate_cam_weakperspective(joints3d=joints3d_i,
+ joints2d=joints2d_i,
+ joints2d_conf=conf2d_i,
+ joints3d_conf=conf3d_i,
+ img_size=img_size[i])
+ return cam.to(device)
+
+def pred_cam_to_transl(pred_camera, focal_length, img_size):
+ pred_cam_t = torch.stack([
+ pred_camera[:, 1], pred_camera[:, 2], 2 * focal_length /
+ (img_size * pred_camera[:, 0] + 1e-9)
+ ],
+ dim=-1)
+ return pred_cam_t
\ No newline at end of file
diff --git a/detrsmpl/utils/keypoint_utils.py b/detrsmpl/utils/keypoint_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..9cacfd85a2a31ff0503a25dbb99ed057a379f7d2
--- /dev/null
+++ b/detrsmpl/utils/keypoint_utils.py
@@ -0,0 +1,61 @@
+from typing import Optional, Tuple, Union
+
+import numpy as np
+
+from detrsmpl.core.conventions.keypoints_mapping import KEYPOINTS_FACTORY
+from detrsmpl.core.conventions.keypoints_mapping.human_data import (
+ HUMAN_DATA_LIMBS_INDEX,
+ HUMAN_DATA_PALETTE,
+)
+
+
+def search_limbs(
+ data_source: str,
+ mask: Optional[Union[np.ndarray, tuple, list]] = None,
+ keypoints_factory: dict = KEYPOINTS_FACTORY) -> Tuple[dict, dict]:
+    """Search the corresponding limbs following the basis human_data limbs.
+    The mask can be used to exclude unavailable keypoints.
+
+ Args:
+ data_source (str): data source type.
+ mask (Optional[Union[np.ndarray, tuple, list]], optional):
+ refer to keypoints_mapping. Defaults to None.
+ keypoints_factory (dict, optional): Dict of all the conventions.
+ Defaults to KEYPOINTS_FACTORY.
+ Returns:
+ Tuple[dict, dict]: (limbs_target, limbs_palette).
+ """
+ limbs_source = HUMAN_DATA_LIMBS_INDEX
+ limbs_palette = HUMAN_DATA_PALETTE
+ keypoints_source = keypoints_factory['human_data']
+ keypoints_target = keypoints_factory[data_source]
+ limbs_target = {}
+ for k, part_limbs in limbs_source.items():
+ limbs_target[k] = []
+ for limb in part_limbs:
+ flag = False
+ if (keypoints_source[limb[0]]
+ in keypoints_target) and (keypoints_source[limb[1]]
+ in keypoints_target):
+ if mask is not None:
+ if mask[keypoints_target.index(keypoints_source[
+ limb[0]])] != 0 and mask[keypoints_target.index(
+ keypoints_source[limb[1]])] != 0:
+ flag = True
+ else:
+ flag = True
+ if flag:
+ limbs_target.setdefault(k, []).append([
+ keypoints_target.index(keypoints_source[limb[0]]),
+ keypoints_target.index(keypoints_source[limb[1]])
+ ])
+ if k in limbs_target:
+ if k == 'body':
+ np.random.seed(0)
+ limbs_palette[k] = np.random.randint(0,
+ high=255,
+ size=(len(
+ limbs_target[k]), 3))
+ else:
+ limbs_palette[k] = np.array(limbs_palette[k])
+ return limbs_target, limbs_palette
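Hypothetical usage of `search_limbs`; 'coco' is used here as an example, but any convention registered in `KEYPOINTS_FACTORY` should work the same way:

```python
limbs, palette = search_limbs(data_source='coco')
print(limbs.keys())     # limb groups (body, face, hands, ...)
print(palette.keys())   # one color array per group
```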
diff --git a/detrsmpl/utils/logger.py b/detrsmpl/utils/logger.py
new file mode 100644
index 0000000000000000000000000000000000000000..8ca1e9451b5ac8dc5278d448d5e916dcb7ed525c
--- /dev/null
+++ b/detrsmpl/utils/logger.py
@@ -0,0 +1,7 @@
+import logging
+
+from mmcv.utils import get_logger
+
+
+def get_root_logger(log_file=None, log_level=logging.INFO):
+ return get_logger('mmhuman3d', log_file, log_level)
diff --git a/detrsmpl/utils/mesh_utils.py b/detrsmpl/utils/mesh_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..00ed5f253ccc99d5cb15c6ed0a68729385dc9e4d
--- /dev/null
+++ b/detrsmpl/utils/mesh_utils.py
@@ -0,0 +1,236 @@
+import warnings
+from typing import List, Optional, Union
+
+import torch
+from pytorch3d.io import IO
+from pytorch3d.io import load_objs_as_meshes as _load_objs_as_meshes
+from pytorch3d.io import save_obj
+from pytorch3d.renderer import TexturesUV, TexturesVertex
+from pytorch3d.structures import (
+ Meshes,
+ Pointclouds,
+ join_meshes_as_batch,
+ join_meshes_as_scene,
+ padded_to_list,
+)
+
+from .path_utils import prepare_output_path
+
+
+def join_batch_meshes_as_scene(
+ meshes: List[Meshes],
+ include_textures: bool = True,
+) -> Meshes:
+    """Join `meshes` as a scene for each batch element. Only for PyTorch3D
+    `Meshes`. The Meshes must share the same batch size, and their topology
+    could be different. They must all be on the same device. If
+    `include_textures` is True, the textures should be of the same type;
+    having them all be None is not accepted. If `include_textures` is False,
+    textures are ignored and the returned meshes will have no textures.
+
+ Args:
+ meshes (List[Meshes]): A `list` of `Meshes` with the same batches.
+ Required.
+ include_textures: (bool) whether to try to join the textures.
+
+ Returns:
+        New Meshes in which the input Meshes are joined per batch element.
+ """
+ for mesh in meshes:
+ mesh._verts_list = padded_to_list(mesh.verts_padded(),
+ mesh.num_verts_per_mesh().tolist())
+ num_scene_size = len(meshes)
+ num_batch_size = len(meshes[0])
+ for i in range(num_scene_size):
+        assert len(meshes[i]) == num_batch_size, \
+            'Please make sure that the Meshes all have the same batch size.'
+ meshes_all = []
+ for j in range(num_batch_size):
+ meshes_batch = []
+ for i in range(num_scene_size):
+ meshes_batch.append(meshes[i][j])
+ meshes_all.append(join_meshes_as_scene(meshes_batch, include_textures))
+ meshes_final = join_meshes_as_batch(meshes_all, include_textures)
+ return meshes_final
+
+
+def mesh_to_pointcloud_vc(
+ meshes: Meshes,
+ include_textures: bool = True,
+ alpha: float = 1.0,
+) -> Pointclouds:
+ """Convert PyTorch3D vertex color `Meshes` to `PointClouds`.
+
+ Args:
+ meshes (Meshes): input meshes.
+ include_textures (bool, optional): Whether include colors.
+ Require the texture of input meshes is vertex color.
+ Defaults to True.
+ alpha (float, optional): transparency.
+ Defaults to 1.0.
+
+ Returns:
+ Pointclouds: output pointclouds.
+ """
+ assert isinstance(
+ meshes.textures,
+ TexturesVertex), 'textures of input meshes should be `TexturesVertex`'
+ vertices = meshes.verts_padded()
+ if include_textures:
+ verts_rgb = meshes.textures.verts_features_padded()
+ verts_rgba = torch.cat(
+ [verts_rgb,
+ torch.ones_like(verts_rgb)[..., 0:1] * alpha], dim=-1)
+ else:
+ verts_rgba = None
+ pointclouds = Pointclouds(points=vertices, features=verts_rgba)
+ return pointclouds
+
+
+def texture_uv2vc(meshes: Meshes) -> Meshes:
+    """Convert a PyTorch3D mesh's textures from TexturesUV to TexturesVertex.
+
+ Args:
+ meshes (Meshes): input Meshes.
+
+ Returns:
+ Meshes: converted Meshes.
+ """
+ assert isinstance(meshes.textures, TexturesUV)
+ device = meshes.device
+ vert_uv = meshes.textures.verts_uvs_padded()
+ batch_size = vert_uv.shape[0]
+ verts_features = []
+ num_verts = meshes.verts_padded().shape[1]
+ for index in range(batch_size):
+ face_uv = vert_uv[index][meshes.textures.faces_uvs_padded()
+ [index].view(-1)]
+
+ img = meshes.textures._maps_padded[index]
+ width, height, _ = img.shape
+
+ face_uv = face_uv * torch.Tensor([width - 1, height - 1
+ ]).long().to(device)
+
+ face_uv[:, 0] = torch.clip(face_uv[:, 0], 0, width - 1)
+ face_uv[:, 1] = torch.clip(face_uv[:, 1], 0, height - 1)
+ face_uv = face_uv.long()
+ faces = meshes.faces_padded()
+ verts_rgb = torch.zeros(1, num_verts, 3).to(device)
+ verts_rgb[:, faces.view(-1)] = img[height - 1 - face_uv[:, 1],
+ face_uv[:, 0]]
+ verts_features.append(verts_rgb)
+ verts_features = torch.cat(verts_features)
+
+ meshes = meshes.clone()
+ meshes.textures = TexturesVertex(verts_features)
+ return meshes
+
+
+def load_objs_as_meshes(files: List[str],
+ device: Optional[Union[torch.device, str]] = None,
+ load_textures: bool = True,
+ **kwargs) -> Meshes:
+ if not isinstance(files, list):
+ files = [files]
+ return _load_objs_as_meshes(files=files,
+ device=device,
+ load_textures=load_textures,
+ **kwargs)
+
+
+def load_plys_as_meshes(
+ files: List[str],
+ device: Optional[Union[torch.device, str]] = None,
+ load_textures: bool = True,
+) -> Meshes:
+ writer = IO()
+ meshes = []
+ if not isinstance(files, list):
+ files = [files]
+ for idx in range(len(files)):
+ assert files[idx].endswith('.ply'), 'Please input .ply files.'
+ mesh = writer.load_mesh(path=files[idx],
+ include_textures=load_textures,
+ device=device)
+ meshes.append(mesh)
+ meshes = join_meshes_as_batch(meshes, include_textures=load_textures)
+ return meshes
+
+
+def save_meshes_as_plys(files: List[str],
+ meshes: Meshes = None,
+ verts: torch.Tensor = None,
+ faces: torch.Tensor = None,
+ verts_rgb: torch.Tensor = None) -> None:
+ """Save meshes as .ply files. Mainly for vertex color meshes.
+
+ Args:
+ files (List[str]): Output .ply file list.
+ meshes (Meshes, optional): higher priority than
+ (verts & faces & verts_rgb). Defaults to None.
+ verts (torch.Tensor, optional): lower priority than meshes.
+ Defaults to None.
+ faces (torch.Tensor, optional): lower priority than meshes.
+ Defaults to None.
+ verts_rgb (torch.Tensor, optional): lower priority than meshes.
+ Defaults to None.
+ """
+ if meshes is None:
+ assert verts is not None and faces is not None, 'Not mesh input.'
+ meshes = Meshes(
+ verts=verts,
+ faces=faces,
+ textures=TexturesVertex(
+ verts_features=verts_rgb) if verts_rgb is not None else None)
+ else:
+ if verts is not None or faces is not None or verts_rgb is not None:
+ warnings.warn('Redundant input, will use meshes only.')
+ assert files is not None
+ if not isinstance(files, list):
+ files = [files]
+ assert len(files) >= len(meshes), 'Not enough output files.'
+ writer = IO()
+ for idx in range(len(meshes)):
+ assert files[idx].endswith('.ply'), 'Please save as .ply files.'
+ writer.save_mesh(meshes[idx],
+ files[idx],
+ colors_as_uint8=True,
+ binary=False)
+
+
+def save_meshes_as_objs(files: List[str], meshes: Meshes = None) -> None:
+    """Save meshes as .obj files. PyTorch3D will not save vertex color for
+    .obj; please use `save_meshes_as_plys` for vertex-colored meshes.
+
+ Args:
+ files (List[str]): Output .obj file list.
+ meshes (Meshes, optional):
+ Defaults to None.
+ """
+ if not isinstance(files, list):
+ files = [files]
+
+ assert len(files) >= len(meshes), 'Not enough output files.'
+
+ for idx in range(len(meshes)):
+        prepare_output_path(files[idx],
+                            allowed_suffix=['.obj'],
+                            path_type='file')
+ if isinstance(meshes.textures, TexturesUV):
+ verts_uvs = meshes.textures.verts_uvs_padded()[idx]
+ faces_uvs = meshes.textures.faces_uvs_padded()[idx]
+ texture_map = meshes.textures.maps_padded()[idx]
+ else:
+ verts_uvs = None
+ faces_uvs = None
+ texture_map = None
+ save_obj(f=files[idx],
+ verts=meshes.verts_padded()[idx],
+ faces=meshes.faces_padded()[idx],
+ verts_uvs=verts_uvs,
+ faces_uvs=faces_uvs,
+ texture_map=texture_map)
diff --git a/detrsmpl/utils/misc.py b/detrsmpl/utils/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..e98eb2fb3983d9bcf726fd7fd172e84a93c5415a
--- /dev/null
+++ b/detrsmpl/utils/misc.py
@@ -0,0 +1,30 @@
+from functools import partial
+
+import torch
+
+
+def multi_apply(func, *args, **kwargs):
+ """Apply function to a list of arguments.
+
+ Note:
+        This function applies the ``func`` to multiple inputs and
+        maps the multiple outputs of the ``func`` into different
+        lists. Each list contains the same type of outputs corresponding
+        to different inputs.
+
+ Args:
+ func (Function): A function that will be applied to a list of
+ arguments
+
+ Returns:
+        tuple(list): A tuple containing multiple lists; each list contains \
+            one kind of result returned by the function.
+ """
+ pfunc = partial(func, **kwargs) if kwargs else func
+ map_results = map(pfunc, *args)
+ return tuple(map(list, zip(*map_results)))
+
+
+def torch_to_numpy(x):
+ assert isinstance(x, torch.Tensor)
+ return x.detach().cpu().numpy()
diff --git a/detrsmpl/utils/path_utils.py b/detrsmpl/utils/path_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..36fb7f69e79d691a81eb5b6faad592e961878bea
--- /dev/null
+++ b/detrsmpl/utils/path_utils.py
@@ -0,0 +1,232 @@
+import os
+import warnings
+from enum import Enum
+from pathlib import Path
+from typing import List, Union
+
+try:
+ from typing import Literal
+except ImportError:
+ from typing_extensions import Literal
+
+
+def check_path_suffix(path_str: str,
+ allowed_suffix: Union[str, List[str]] = '') -> bool:
+ """Check whether the suffix of the path is allowed.
+
+ Args:
+ path_str (str):
+ Path to check.
+ allowed_suffix (List[str], optional):
+ What extension names are allowed.
+            Offer a list like ['.jpg', '.jpeg'].
+            When it's [], any suffix is accepted.
+            Use [''] to allow a directory.
+            Defaults to ''.
+
+ Returns:
+ bool:
+ True: suffix test passed
+ False: suffix test failed
+ """
+ if isinstance(allowed_suffix, str):
+ allowed_suffix = [allowed_suffix]
+ pathinfo = Path(path_str)
+ suffix = pathinfo.suffix.lower()
+ if len(allowed_suffix) == 0:
+ return True
+ if pathinfo.is_dir():
+ if '' in allowed_suffix:
+ return True
+ else:
+ return False
+ else:
+ for index, tmp_suffix in enumerate(allowed_suffix):
+ if not tmp_suffix.startswith('.'):
+ tmp_suffix = '.' + tmp_suffix
+ allowed_suffix[index] = tmp_suffix.lower()
+ if suffix in allowed_suffix:
+ return True
+ else:
+ return False
+
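A few quick examples of the suffix check; the paths do not need to exist for this test, and the comparison is case-insensitive:

```python
assert check_path_suffix('demo.mp4', allowed_suffix=['.mp4', '.gif'])
assert not check_path_suffix('demo.avi', allowed_suffix=['.mp4', '.gif'])
assert check_path_suffix('demo.MP4', allowed_suffix='mp4')
assert check_path_suffix('anything.xyz', allowed_suffix=[])   # [] accepts all
```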
+
+class Existence(Enum):
+ """State of file existence."""
+ FileExist = 0
+ DirectoryExistEmpty = 1
+ DirectoryExistNotEmpty = 2
+ MissingParent = 3
+ DirectoryNotExist = 4
+ FileNotExist = 5
+
+
+def check_path_existence(
+ path_str: str,
+ path_type: Literal['file', 'dir', 'auto'] = 'auto',
+) -> Existence:
+ """Check whether a file or a directory exists at the expected path.
+
+ Args:
+ path_str (str):
+ Path to check.
+        path_type (Literal['file', 'dir', 'auto'], optional):
+            What kind of file do we expect at the path.
+            Choose among `file`, `dir`, `auto`.
+            Defaults to 'auto'.
+
+ Raises:
+ KeyError: if `path_type` conflicts with `path_str`
+
+ Returns:
+ Existence:
+ 0. FileExist: file at path_str exists.
+            1. DirectoryExistEmpty: folder at path_str exists and is empty.
+ 2. DirectoryExistNotEmpty: folder at path_str exists and not empty.
+ 3. MissingParent: its parent doesn't exist.
+ 4. DirectoryNotExist: expect a folder at path_str, but not found.
+ 5. FileNotExist: expect a file at path_str, but not found.
+ """
+ path_type = path_type.lower()
+ assert path_type in {'file', 'dir', 'auto'}
+ pathinfo = Path(path_str)
+ if not pathinfo.parent.is_dir():
+ return Existence.MissingParent
+ suffix = pathinfo.suffix.lower()
+ if path_type == 'dir' or\
+ path_type == 'auto' and suffix == '':
+ if pathinfo.is_dir():
+ if len(os.listdir(path_str)) == 0:
+ return Existence.DirectoryExistEmpty
+ else:
+ return Existence.DirectoryExistNotEmpty
+ else:
+ return Existence.DirectoryNotExist
+ elif path_type == 'file' or\
+ path_type == 'auto' and suffix != '':
+ if pathinfo.is_file():
+ return Existence.FileExist
+ elif pathinfo.is_dir():
+ if len(os.listdir(path_str)) == 0:
+ return Existence.DirectoryExistEmpty
+ else:
+ return Existence.DirectoryExistNotEmpty
+ if path_str.endswith('/'):
+ return Existence.DirectoryNotExist
+ else:
+ return Existence.FileNotExist
+
+
+def prepare_output_path(output_path: str,
+ allowed_suffix: List[str] = [],
+ tag: str = 'output file',
+ path_type: Literal['file', 'dir', 'auto'] = 'auto',
+ overwrite: bool = True) -> None:
+ """Check output folder or file.
+
+ Args:
+ output_path (str): could be folder or file.
+ allowed_suffix (List[str], optional):
+ Check the suffix of `output_path`. If folder, should be [] or [''].
+            If it could be either a folder or a file, should be [suffixes..., ''].
+ Defaults to [].
+ tag (str, optional): The `string` tag to specify the output type.
+ Defaults to 'output file'.
+        path_type (Literal['file', 'dir', 'auto'], optional):
+ Choose `file` for file and `dir` for folder.
+ Choose `auto` if allowed to be both.
+ Defaults to 'auto'.
+ overwrite (bool, optional):
+ Whether overwrite the existing file or folder.
+ Defaults to True.
+
+ Raises:
+ FileNotFoundError: suffix does not match.
+ FileExistsError: file or folder already exists and `overwrite` is
+ False.
+
+ Returns:
+ None
+ """
+ if path_type.lower() == 'dir':
+ allowed_suffix = []
+ exist_result = check_path_existence(output_path, path_type=path_type)
+ if exist_result == Existence.MissingParent:
+ warnings.warn(
+ f'The parent folder of {tag} does not exist: {output_path},' +
+ f' will make dir {Path(output_path).parent.absolute().__str__()}')
+ os.makedirs(Path(output_path).parent.absolute().__str__(),
+ exist_ok=True)
+
+ elif exist_result == Existence.DirectoryNotExist:
+ os.mkdir(output_path)
+ print(f'Making directory {output_path} for saving results.')
+ elif exist_result == Existence.FileNotExist:
+ suffix_matched = \
+ check_path_suffix(output_path, allowed_suffix=allowed_suffix)
+ if not suffix_matched:
+ raise FileNotFoundError(
+ f'The {tag} should be {", ".join(allowed_suffix)}: '
+ f'{output_path}.')
+ elif exist_result == Existence.FileExist:
+ if not overwrite:
+ raise FileExistsError(
+ f'{output_path} exists (set overwrite = True to overwrite).')
+ else:
+ print(f'Overwriting {output_path}.')
+ elif exist_result == Existence.DirectoryExistEmpty:
+ pass
+ elif exist_result == Existence.DirectoryExistNotEmpty:
+ if not overwrite:
+ raise FileExistsError(
+ f'{output_path} is not empty (set overwrite = '
+ 'True to overwrite the files).')
+ else:
+ print(f'Overwriting {output_path} and its files.')
+ else:
+ raise FileNotFoundError(f'No Existence type for {output_path}.')
+
+
+def check_input_path(
+ input_path: str,
+ allowed_suffix: List[str] = [],
+ tag: str = 'input file',
+ path_type: Literal['file', 'dir', 'auto'] = 'auto',
+):
+ """Check input folder or file.
+
+ Args:
+ input_path (str): input folder or file path.
+ allowed_suffix (List[str], optional):
+ Check the suffix of `input_path`. If folder, should be [] or [''].
+            If it could be either a folder or a file, should be
+            [suffixes..., ''].
+            Defaults to [].
+        tag (str, optional): The `string` tag to specify the input type.
+            Defaults to 'input file'.
+        path_type (Literal['file', 'dir', 'auto'], optional):
+            Choose `file` for file and `dir` for folder.
+ Choose `auto` if allowed to be both.
+ Defaults to 'auto'.
+
+ Raises:
+        FileNotFoundError: file does not exist or suffix does not match.
+
+ Returns:
+ None
+ """
+ if path_type.lower() == 'dir':
+ allowed_suffix = []
+ exist_result = check_path_existence(input_path, path_type=path_type)
+
+ if exist_result in [
+ Existence.FileExist, Existence.DirectoryExistEmpty,
+ Existence.DirectoryExistNotEmpty
+ ]:
+ suffix_matched = \
+ check_path_suffix(input_path, allowed_suffix=allowed_suffix)
+ if not suffix_matched:
+ raise FileNotFoundError(
+ f'The {tag} should be {", ".join(allowed_suffix)}:' +
+ f'{input_path}.')
+ else:
+ raise FileNotFoundError(f'The {tag} does not exist: {input_path}.')
diff --git a/detrsmpl/utils/tmp b/detrsmpl/utils/tmp
new file mode 100644
index 0000000000000000000000000000000000000000..9e101f41c51b0a7396301add5c27f04ba9f96044
--- /dev/null
+++ b/detrsmpl/utils/tmp
@@ -0,0 +1,1374 @@
+import glob
+import json
+import os
+import shutil
+import string
+import subprocess
+import sys
+from pathlib import Path
+from typing import Iterable, List, Optional, Tuple, Union
+
+import numpy as np
+
+from mmhuman3d.utils.path_utils import check_input_path, prepare_output_path
+
+try:
+ from typing import Literal
+except ImportError:
+ from typing_extensions import Literal
+
+
+class video_writer:
+
+ def __init__(self,
+ output_path: str,
+ resolution: Iterable[int],
+ fps: float = 30.0,
+ num_frame: int = 1e9,
+ disable_log: bool = False) -> None:
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+ height, width = resolution
+ width += width % 2
+ height += height % 2
+ command = [
+ 'ffmpeg',
+ '-y', # (optional) overwrite output file if it exists
+ '-f',
+ 'rawvideo',
+ '-pix_fmt',
+ 'bgr24',
+ '-s',
+ f'{int(width)}x{int(height)}',
+ '-r',
+ f'{fps}', # frames per second
+ '-loglevel',
+ 'error',
+ '-threads',
+ '1',
+ '-i',
+ '-', # The input comes from a pipe
+ '-vcodec',
+ 'libx264',
+ '-r',
+ f'{fps}', # frames per second
+ '-an', # Tells FFMPEG not to expect any audio
+ output_path,
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ process = subprocess.Popen(
+ command,
+ stdin=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ if process.stdin is None or process.stderr is None:
+ raise BrokenPipeError('No buffer received.')
+ self.process = process
+ self.num_frame = num_frame
+ self.len = 0
+
+ def write(self, image_array: np.ndarray):
+ if self.len <= self.num_frame:
+ try:
+ self.process.stdin.write(image_array.tobytes())
+ self.len += 1
+ except KeyboardInterrupt:
+ self.__del__()
+
+ def __del__(self):
+ self.process.stdin.close()
+ self.process.stderr.close()
+ self.process.wait()
+
+
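A sketch of streaming frames straight into an .mp4 through the `video_writer` pipe; the frames here are synthetic noise and the output path is a placeholder (ffmpeg must be on the PATH):

```python
import numpy as np

writer = video_writer('demo_out/noise.mp4', resolution=(480, 640), fps=30)
for _ in range(60):
    frame = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)  # BGR
    writer.write(frame)
del writer   # closes the ffmpeg pipe and finalizes the file
```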
+def array_to_video(
+ image_array: np.ndarray,
+ output_path: str,
+ fps: Union[int, float] = 30,
+ resolution: Optional[Union[Tuple[int, int], Tuple[float, float]]] = None,
+ disable_log: bool = False,
+) -> None:
+ """Convert an array to a video directly, gif not supported.
+
+ Args:
+ image_array (np.ndarray): shape should be (f * h * w * 3).
+ output_path (str): output video file path.
+        fps (Union[int, float], optional): fps. Defaults to 30.
+        resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+            optional): (height, width) of the output video.
+            Defaults to None.
+        disable_log (bool, optional): whether to suppress the ffmpeg command
+            info. Defaults to False.
+ Raises:
+ FileNotFoundError: check output path.
+ TypeError: check input array.
+
+ Returns:
+ None.
+ """
+ if not isinstance(image_array, np.ndarray):
+ raise TypeError('Input should be np.ndarray.')
+ assert image_array.ndim == 4
+ assert image_array.shape[-1] == 3
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+ if resolution:
+ height, width = resolution
+ width += width % 2
+ height += height % 2
+ else:
+ image_array = pad_for_libx264(image_array)
+ height, width = image_array.shape[1], image_array.shape[2]
+ command = [
+ 'ffmpeg',
+ '-y', # (optional) overwrite output file if it exists
+ '-f',
+ 'rawvideo',
+ '-s',
+ f'{int(width)}x{int(height)}', # size of one frame
+ '-pix_fmt',
+ 'bgr24',
+ '-r',
+ f'{fps}', # frames per second
+ '-loglevel',
+ 'error',
+ '-threads',
+ '4',
+        '-i',
+ '-', # The input comes from a pipe
+ '-vcodec',
+ 'libx264',
+ '-an', # Tells FFMPEG not to expect any audio
+ output_path,
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ process = subprocess.Popen(
+ command,
+ stdin=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ )
+ if process.stdin is None or process.stderr is None:
+ raise BrokenPipeError('No buffer received.')
+ index = 0
+ while True:
+ if index >= image_array.shape[0]:
+ break
+ process.stdin.write(image_array[index].tobytes())
+ index += 1
+ process.stdin.close()
+ process.stderr.close()
+ process.wait()
+
+
+def array_to_images(
+ image_array: np.ndarray,
+ output_folder: str,
+ img_format: str = '%06d.png',
+ resolution: Optional[Union[Tuple[int, int], Tuple[float, float]]] = None,
+ disable_log: bool = False,
+) -> None:
+ """Convert an array to images directly.
+
+ Args:
+ image_array (np.ndarray): shape should be (f * h * w * 3).
+ output_folder (str): output folder for the images.
+ img_format (str, optional): format of the images.
+ Defaults to '%06d.png'.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): resolution(height, width) of output.
+ Defaults to None.
+        disable_log (bool, optional): whether to suppress the ffmpeg command
+            info. Defaults to False.
+
+ Raises:
+ FileNotFoundError: check output folder.
+ TypeError: check input array.
+
+ Returns:
+ None
+ """
+ prepare_output_path(
+ output_folder,
+ allowed_suffix=[],
+ tag='output image folder',
+ path_type='dir',
+ overwrite=True)
+
+ if not isinstance(image_array, np.ndarray):
+ raise TypeError('Input should be np.ndarray.')
+ assert image_array.ndim == 4
+ assert image_array.shape[-1] == 3
+ if resolution:
+ height, width = resolution
+ else:
+ height, width = image_array.shape[1], image_array.shape[2]
+ command = [
+ 'ffmpeg',
+ '-y', # (optional) overwrite output file if it exists
+ '-f',
+ 'rawvideo',
+ '-s',
+ f'{int(width)}x{int(height)}', # size of one frame
+ '-pix_fmt',
+ 'bgr24', # bgr24 for matching OpenCV
+ '-loglevel',
+ 'error',
+ '-threads',
+ '4',
+ '-i',
+ '-', # The input comes from a pipe
+ '-f',
+ 'image2',
+ '-start_number',
+ '0',
+ os.path.join(output_folder, img_format),
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ process = subprocess.Popen(
+ command,
+ stdin=subprocess.PIPE,
+ stderr=subprocess.PIPE,
+ bufsize=10**8,
+ close_fds=True)
+ if process.stdin is None or process.stderr is None:
+ raise BrokenPipeError('No buffer received.')
+ index = 0
+ while True:
+ if index >= image_array.shape[0]:
+ break
+ process.stdin.write(image_array[index].tobytes())
+ index += 1
+ process.stdin.close()
+ process.stderr.close()
+ process.wait()
+
+
+def video_to_array(
+ input_path: str,
+ resolution: Optional[Union[Tuple[int, int], Tuple[float, float]]] = None,
+ start: int = 0,
+ end: Optional[int] = None,
+ disable_log: bool = False,
+) -> np.ndarray:
+ """
+ Read a video/gif as an array of (f * h * w * 3).
+
+ Args:
+ input_path (str): input path.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): resolution(height, width) of output.
+ Defaults to None.
+ start (int, optional): start frame index. Inclusive.
+ If < 0, will be converted to frame_index range in [0, frame_num].
+ Defaults to 0.
+ end (int, optional): end frame index. Exclusive.
+ Could be positive int or negative int or None.
+ If None, all frames from start till the last frame are included.
+ Defaults to None.
+        disable_log (bool, optional): whether to suppress the ffmpeg command
+            info. Defaults to False.
+
+ Raises:
+ FileNotFoundError: check the input path.
+
+ Returns:
+ np.ndarray: shape will be (f * h * w * 3).
+ """
+ check_input_path(
+ input_path,
+ allowed_suffix=['.mp4', 'mkv', 'avi', '.gif'],
+ tag='input video',
+ path_type='file')
+
+ info = vid_info_reader(input_path)
+ if resolution:
+ height, width = resolution
+ else:
+ width, height = int(info['width']), int(info['height'])
+ num_frames = int(info['nb_frames'])
+ start = (min(start, num_frames - 1) + num_frames) % num_frames
+ end = (min(end, num_frames - 1) +
+ num_frames) % num_frames if end is not None else num_frames
+ command = [
+ 'ffmpeg',
+ '-i',
+ input_path,
+ '-filter_complex',
+ f'[0]trim=start_frame={start}:end_frame={end}[v0]',
+ '-map',
+ '[v0]',
+ '-pix_fmt',
+ 'bgr24', # bgr24 for matching OpenCV
+ '-s',
+ f'{int(width)}x{int(height)}',
+ '-f',
+ 'image2pipe',
+ '-vcodec',
+ 'rawvideo',
+ '-loglevel',
+ 'error',
+ 'pipe:'
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ # Execute FFmpeg as sub-process with stdout as a pipe
+ process = subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=10**8)
+ if process.stdout is None:
+ raise BrokenPipeError('No buffer received.')
+ # Read decoded video frames from the PIPE until no more frames to read
+ array = []
+ while True:
+ # Read decoded video frame (in raw video format) from stdout process.
+ buffer = process.stdout.read(int(width * height * 3))
+        # Break the loop if buffer length is not W*H*3
+        # (when FFmpeg streaming ends).
+ if len(buffer) != width * height * 3:
+ break
+ img = np.frombuffer(buffer, np.uint8).reshape(height, width, 3)
+ array.append(img[np.newaxis])
+ process.stdout.flush()
+ process.stdout.close()
+ process.wait()
+ return np.concatenate(array)
+
+
+def images_to_sorted_images(input_folder, output_folder, img_format='%06d'):
+ """Copy and rename a folder of images into a new folder following the
+ `img_format`.
+
+ Args:
+ input_folder (str): input folder.
+ output_folder (str): output folder.
+ img_format (str, optional): image format name, do not need extension.
+ Defaults to '%06d'.
+
+ Returns:
+ str: image format of the rename images.
+ """
+ img_format = img_format.rsplit('.', 1)[0]
+ file_list = []
+ os.makedirs(output_folder, exist_ok=True)
+ pngs = glob.glob(os.path.join(input_folder, '*.png'))
+ if pngs:
+ ext = 'png'
+ file_list.extend(pngs)
+ jpgs = glob.glob(os.path.join(input_folder, '*.jpg'))
+ if jpgs:
+ ext = 'jpg'
+ file_list.extend(jpgs)
+ file_list.sort()
+ for index, file_name in enumerate(file_list):
+ shutil.copy(
+ file_name,
+ os.path.join(output_folder, (img_format + '.%s') % (index, ext)))
+ return img_format + '.%s' % ext
+
+
+def images_to_array(
+ input_folder: str,
+ resolution: Optional[Union[Tuple[int, int], Tuple[float, float]]] = None,
+ img_format: str = '%06d.png',
+ start: int = 0,
+ end: Optional[int] = None,
+ remove_raw_files: bool = False,
+ disable_log: bool = False,
+) -> np.ndarray:
+ """
+ Read a folder of images as an array of (f * h * w * 3).
+
+ Args:
+ input_folder (str): folder of input images.
+        resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+            optional): resolution (height, width) of output. Defaults to None.
+ img_format (str, optional): format of images to be read.
+ Defaults to '%06d.png'.
+ start (int, optional): start frame index. Inclusive.
+ If < 0, will be converted to frame_index range in [0, frame_num].
+ Defaults to 0.
+ end (int, optional): end frame index. Exclusive.
+ Could be positive int or negative int or None.
+ If None, all frames from start till the last frame are included.
+ Defaults to None.
+ remove_raw_files (bool, optional): whether remove raw images.
+ Defaults to False.
+        disable_log (bool, optional): whether to suppress the ffmpeg command info.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+
+ Returns:
+        np.ndarray: shape will be (f, h, w, 3).
+ """
+ check_input_path(
+ input_folder,
+ allowed_suffix=[''],
+ tag='input image folder',
+ path_type='dir')
+
+ input_folderinfo = Path(input_folder)
+
+ temp_input_folder = None
+ if img_format is None:
+ temp_input_folder = os.path.join(input_folderinfo.parent,
+ input_folderinfo.name + '_temp')
+ img_format = images_to_sorted_images(
+ input_folder=input_folder, output_folder=temp_input_folder)
+ input_folder = temp_input_folder
+
+ info = vid_info_reader(f'{input_folder}/{img_format}' % start)
+    if resolution:
+        height, width = resolution
+    else:
+        width, height = int(info['width']), int(info['height'])
+
+ num_frames = len(os.listdir(input_folder))
+ start = max(start, 0) % num_frames
+ end = min(end, num_frames) % (num_frames + 1) \
+ if end is not None else num_frames
+ command = [
+ 'ffmpeg',
+ '-y',
+ '-threads',
+ '1',
+ '-start_number',
+ f'{start}',
+ '-i',
+ f'{input_folder}/{img_format}',
+ '-frames:v',
+ f'{end - start}',
+ '-f',
+ 'rawvideo',
+ '-pix_fmt',
+ 'bgr24', # bgr24 for matching OpenCV
+ '-s',
+ f'{int(width)}x{int(height)}',
+ '-loglevel',
+ 'error',
+ '-'
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ process = subprocess.Popen(command, stdout=subprocess.PIPE, bufsize=10**8)
+ if process.stdout is None:
+ raise BrokenPipeError('No buffer received.')
+ # Read decoded video frames from the PIPE until no more frames to read
+ array = []
+ while True:
+ # Read decoded video frame (in raw video format) from stdout process.
+ buffer = process.stdout.read(int(width * height * 3))
+ # Break the loop if buffer length is not W*H*3\
+ # (when FFmpeg streaming ends).
+
+ if len(buffer) != width * height * 3:
+ break
+ img = np.frombuffer(buffer, np.uint8).reshape(height, width, 3)
+ array.append(img[np.newaxis])
+ process.stdout.flush()
+ process.stdout.close()
+ process.wait()
+ if temp_input_folder is not None:
+ if Path(temp_input_folder).is_dir():
+ shutil.rmtree(temp_input_folder)
+ if remove_raw_files:
+ if Path(input_folder).is_dir():
+ shutil.rmtree(input_folder)
+
+ return np.concatenate(array)
+
+
+class vid_info_reader(object):
+
+ def __init__(self, input_path) -> None:
+ """Get video information from video, mimiced from ffmpeg-python.
+ https://github.com/kkroening/ffmpeg-python.
+
+ Args:
+            input_path (str): input video or image file path.
+
+ Raises:
+ FileNotFoundError: check the input path.
+
+ Returns:
+ None.
+ """
+ check_input_path(
+ input_path,
+ allowed_suffix=['.mp4', '.gif', '.png', '.jpg', '.jpeg'],
+ tag='input file',
+ path_type='file')
+ cmd = [
+ 'ffprobe', '-show_format', '-show_streams', '-of', 'json',
+ input_path
+ ]
+ process = subprocess.Popen(
+ cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+ out, _ = process.communicate()
+ probe = json.loads(out.decode('utf-8'))
+ video_stream = next((stream for stream in probe['streams']
+ if stream['codec_type'] == 'video'), None)
+ if video_stream is None:
+ print('No video stream found', file=sys.stderr)
+ sys.exit(1)
+ self.video_stream = video_stream
+
+ def __getitem__(
+ self,
+ key: Literal['index', 'codec_name', 'codec_long_name', 'profile',
+ 'codec_type', 'codec_time_base', 'codec_tag_string',
+ 'codec_tag', 'width', 'height', 'coded_width',
+ 'coded_height', 'has_b_frames', 'pix_fmt', 'level',
+ 'chroma_location', 'refs', 'is_avc', 'nal_length_size',
+ 'r_frame_rate', 'avg_frame_rate', 'time_base',
+ 'start_pts', 'start_time', 'duration_ts', 'duration',
+ 'bit_rate', 'bits_per_raw_sample', 'nb_frames',
+ 'disposition', 'tags']):
+ """Key (str): select in ['index', 'codec_name', 'codec_long_name',
+ 'profile', 'codec_type', 'codec_time_base', 'codec_tag_string',
+ 'codec_tag', 'width', 'height', 'coded_width', 'coded_height',
+ 'has_b_frames', 'pix_fmt', 'level', 'chroma_location', 'refs',
+ 'is_avc', 'nal_length_size', 'r_frame_rate', 'avg_frame_rate',
+ 'time_base', 'start_pts', 'start_time', 'duration_ts', 'duration',
+ 'bit_rate', 'bits_per_raw_sample', 'nb_frames', 'disposition',
+ 'tags']"""
+ return self.video_stream[key]
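+
+
+# Minimal usage sketch of vid_info_reader (illustration only, never called by
+# this module). 'demo.mp4' is a hypothetical local file.
+def _example_vid_info_reader() -> None:
+    info = vid_info_reader('demo.mp4')
+    # ffprobe reports every field as a string, so cast before arithmetic.
+    width, height = int(info['width']), int(info['height'])
+    num_frames = int(info['nb_frames'])
+    print(f'{width}x{height}, {num_frames} frames')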
+
+
+def video_to_gif(
+ input_path: str,
+ output_path: str,
+ resolution: Optional[Union[Tuple[int, int], Tuple[float, float]]] = None,
+ fps: Union[float, int] = 15,
+ disable_log: bool = False,
+) -> None:
+ """Convert a video to a gif file.
+
+ Args:
+ input_path (str): video file path.
+ output_path (str): gif file path.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): (height, width) of the output video.
+ Defaults to None.
+ fps (Union[float, int], optional): frames per second. Defaults to 15.
+        disable_log (bool, optional): whether to suppress the ffmpeg command info.
+ Defaults to False.
+
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None.
+ """
+ check_input_path(
+ input_path,
+ allowed_suffix=['.mp4'],
+ tag='input video',
+ path_type='file')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.gif'],
+ tag='output gif',
+ path_type='file',
+ overwrite=True)
+
+ info = vid_info_reader(input_path)
+ duration = info['duration']
+ if resolution:
+ height, width = resolution
+ else:
+ width, height = int(info['width']), int(info['height'])
+
+ command = [
+ 'ffmpeg', '-r',
+ str(info['r_frame_rate']), '-i', input_path, '-r', f'{fps}', '-s',
+ f'{width}x{height}', '-loglevel', 'error', '-t', f'{duration}',
+ '-threads', '4', '-y', output_path
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+
+
+def video_to_images(input_path: str,
+ output_folder: str,
+ resolution: Optional[Union[Tuple[int, int],
+ Tuple[float, float]]] = None,
+ img_format: str = '%06d.png',
+ start: int = 0,
+ end: Optional[int] = None,
+ disable_log: bool = False) -> None:
+ """Convert a video to a folder of images.
+
+ Args:
+ input_path (str): video file path
+ output_folder (str): output folder to store the images
+        resolution (Optional[Tuple[int, int]], optional):
+            (height, width) of output. Defaults to None.
+        img_format (str, optional): format of the saved images.
+            Defaults to '%06d.png'.
+ start (int, optional): start frame index. Inclusive.
+ If < 0, will be converted to frame_index range in [0, frame_num].
+ Defaults to 0.
+ end (int, optional): end frame index. Exclusive.
+ Could be positive int or negative int or None.
+ If None, all frames from start till the last frame are included.
+ Defaults to None.
+        disable_log (bool, optional): whether to suppress the ffmpeg command info.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path
+ FileNotFoundError: check the output path
+
+ Returns:
+ None
+ """
+ check_input_path(
+ input_path,
+ allowed_suffix=['.mp4'],
+ tag='input video',
+ path_type='file')
+ prepare_output_path(
+ output_folder,
+ allowed_suffix=[],
+ tag='output image folder',
+ path_type='dir',
+ overwrite=True)
+ info = vid_info_reader(input_path)
+ num_frames = int(info['nb_frames'])
+ start = (min(start, num_frames - 1) + num_frames) % num_frames
+ end = (min(end, num_frames - 1) +
+ num_frames) % num_frames if end is not None else num_frames
+
+ command = [
+ 'ffmpeg', '-i', input_path, '-filter_complex',
+ f'[0]trim=start_frame={start}:end_frame={end}[v0]', '-map', '[v0]',
+ '-f', 'image2', '-v', 'error', '-start_number', '0', '-threads', '1',
+ f'{output_folder}/{img_format}'
+ ]
+ if resolution:
+ height, width = resolution
+ command.insert(3, '-s')
+ command.insert(4, '%dx%d' % (width, height))
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
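+
+
+# Minimal usage sketch of video_to_images (illustration only, never called by
+# this module). Paths are hypothetical.
+def _example_video_to_images() -> None:
+    # Dump every frame of the clip into demo_frames/000000.png, 000001.png, ...
+    video_to_images('demo.mp4', 'demo_frames', img_format='%06d.png')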
+
+
+def images_to_video(input_folder: str,
+ output_path: str,
+ remove_raw_file: bool = False,
+ img_format: str = '%06d.png',
+ fps: Union[int, float] = 30,
+ resolution: Optional[Union[Tuple[int, int],
+ Tuple[float, float]]] = None,
+ start: int = 0,
+ end: Optional[int] = None,
+ disable_log: bool = False) -> None:
+ """Convert a folder of images to a video.
+
+ Args:
+ input_folder (str): input image folder
+ output_path (str): output video file path
+ remove_raw_file (bool, optional): whether remove raw images.
+ Defaults to False.
+        img_format (str, optional): format to name the images.
+            Defaults to '%06d.png'.
+        fps (Union[int, float], optional): output video fps. Defaults to 30.
+        resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+            optional): (height, width) of output.
+            Defaults to None.
+ start (int, optional): start frame index. Inclusive.
+ If < 0, will be converted to frame_index range in [0, frame_num].
+ Defaults to 0.
+ end (int, optional): end frame index. Exclusive.
+ Could be positive int or negative int or None.
+ If None, all frames from start till the last frame are included.
+ Defaults to None.
+        disable_log (bool, optional): whether to suppress the ffmpeg command info.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None
+ """
+ check_input_path(
+ input_folder,
+ allowed_suffix=[],
+ tag='input image folder',
+ path_type='dir')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+ input_folderinfo = Path(input_folder)
+ num_frames = len(os.listdir(input_folder))
+ start = (min(start, num_frames - 1) + num_frames) % num_frames
+ end = (min(end, num_frames - 1) +
+ num_frames) % num_frames if end is not None else num_frames
+ temp_input_folder = None
+ if img_format is None:
+ temp_input_folder = os.path.join(input_folderinfo.parent,
+ input_folderinfo.name + '_temp')
+ img_format = images_to_sorted_images(input_folder, temp_input_folder)
+
+ command = [
+ 'ffmpeg',
+ '-y',
+ '-threads',
+ '4',
+ '-start_number',
+ f'{start}',
+ '-r',
+ f'{fps}',
+ '-i',
+ f'{input_folder}/{img_format}'
+ if temp_input_folder is None else f'{temp_input_folder}/{img_format}',
+ '-frames:v',
+ f'{end - start}',
+ '-profile:v',
+ 'baseline',
+ '-level',
+ '3.0',
+ '-c:v',
+ 'libx264',
+ '-pix_fmt',
+ 'yuv420p',
+ '-an',
+ '-v',
+ 'error',
+ '-loglevel',
+ 'error',
+ output_path,
+ ]
+ if resolution:
+ height, width = resolution
+ width += width % 2
+ height += height % 2
+ command.insert(1, '-s')
+ command.insert(2, '%dx%d' % (width, height))
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+ if remove_raw_file:
+ if Path(input_folder).is_dir():
+ shutil.rmtree(input_folder)
+ if temp_input_folder is not None:
+ if Path(temp_input_folder).is_dir():
+ shutil.rmtree(temp_input_folder)
+
+
+def images_to_gif(
+ input_folder: str,
+ output_path: str,
+ remove_raw_file: bool = False,
+ img_format: str = '%06d.png',
+ fps: int = 15,
+ resolution: Optional[Union[Tuple[int, int], Tuple[float, float]]] = None,
+ start: int = 0,
+ end: Optional[int] = None,
+ disable_log: bool = False,
+) -> None:
+ """Convert series of images to a video, similar to images_to_video, but
+ provide more suitable parameters.
+
+ Args:
+ input_folder (str): input image folder.
+ output_path (str): output gif file path.
+ remove_raw_file (bool, optional): whether remove raw images.
+ Defaults to False.
+ img_format (str, optional): format to name the images.
+ Defaults to '%06d.png'.
+ fps (int, optional): output video fps. Defaults to 15.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): (height, width) of output. Defaults to None.
+ start (int, optional): start frame index. Inclusive.
+ If < 0, will be converted to frame_index range in [0, frame_num].
+ Defaults to 0.
+ end (int, optional): end frame index. Exclusive.
+ Could be positive int or negative int or None.
+ If None, all frames from start till the last frame are included.
+ Defaults to None.
+        disable_log (bool, optional): whether to suppress the ffmpeg command info.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None
+ """
+ input_folderinfo = Path(input_folder)
+ check_input_path(
+ input_folder,
+ allowed_suffix=[],
+ tag='input image folder',
+ path_type='dir')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.gif'],
+ tag='output gif',
+ path_type='file',
+ overwrite=True)
+ num_frames = len(os.listdir(input_folder))
+ start = (min(start, num_frames - 1) + num_frames) % num_frames
+ end = (min(end, num_frames - 1) +
+ num_frames) % num_frames if end is not None else num_frames
+ temp_input_folder = None
+ if img_format is None:
+ file_list = []
+ temp_input_folder = os.path.join(input_folderinfo.parent,
+ input_folderinfo.name + '_temp')
+ os.makedirs(temp_input_folder, exist_ok=True)
+ pngs = glob.glob(os.path.join(input_folder, '*.png'))
+ ext = 'png'
+ if pngs:
+ ext = 'png'
+ file_list.extend(pngs)
+ jpgs = glob.glob(os.path.join(input_folder, '*.jpg'))
+ if jpgs:
+ ext = 'jpg'
+ file_list.extend(jpgs)
+ file_list.sort()
+ for index, file_name in enumerate(file_list):
+ shutil.copy(
+ file_name,
+ os.path.join(temp_input_folder, '%06d.%s' % (index + 1, ext)))
+ input_folder = temp_input_folder
+ img_format = '%06d.' + ext
+
+ command = [
+ 'ffmpeg',
+ '-y',
+ '-threads',
+ '4',
+ '-start_number',
+ f'{start}',
+ '-r',
+ f'{fps}',
+ '-i',
+ f'{input_folder}/{img_format}',
+ '-frames:v',
+ f'{end - start}',
+ '-loglevel',
+ 'error',
+ '-v',
+ 'error',
+ output_path,
+ ]
+ if resolution:
+ height, width = resolution
+ command.insert(1, '-s')
+ command.insert(2, '%dx%d' % (width, height))
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+ if remove_raw_file:
+ shutil.rmtree(input_folder)
+ if temp_input_folder is not None:
+ shutil.rmtree(temp_input_folder)
+
+
+def gif_to_video(input_path: str,
+ output_path: str,
+ fps: int = 30,
+ remove_raw_file: bool = False,
+ resolution: Optional[Union[Tuple[int, int],
+ Tuple[float, float]]] = None,
+ disable_log: bool = False) -> None:
+ """Convert a gif file to a video.
+
+ Args:
+ input_path (str): input gif file path.
+ output_path (str): output video file path.
+ fps (int, optional): fps. Defaults to 30.
+ remove_raw_file (bool, optional): whether remove original input file.
+ Defaults to False.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): (height, width) of output. Defaults to None.
+        disable_log (bool, optional): whether to suppress the ffmpeg command info.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None
+ """
+ check_input_path(
+ input_path, allowed_suffix=['.gif'], tag='input gif', path_type='file')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+ command = [
+ 'ffmpeg', '-i', input_path, '-r', f'{fps}', '-loglevel', 'error', '-y',
+ output_path, '-threads', '4'
+ ]
+ if resolution:
+ height, width = resolution
+ command.insert(3, '-s')
+ command.insert(4, '%dx%d' % (width, height))
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+ if remove_raw_file:
+ subprocess.call(['rm', '-f', input_path])
+
+
+def gif_to_images(input_path: str,
+ output_folder: str,
+ fps: int = 30,
+ img_format: str = '%06d.png',
+ resolution: Optional[Union[Tuple[int, int],
+ Tuple[float, float]]] = None,
+ disable_log: bool = False) -> None:
+ """Convert a gif file to a folder of images.
+
+ Args:
+ input_path (str): input gif file path.
+ output_folder (str): output folder to save the images.
+ fps (int, optional): fps. Defaults to 30.
+ img_format (str, optional): output image name format.
+ Defaults to '%06d.png'.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): (height, width) of output.
+ Defaults to None.
+        disable_log (bool, optional): whether to suppress the ffmpeg command info.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None
+ """
+ check_input_path(
+ input_path, allowed_suffix=['.gif'], tag='input gif', path_type='file')
+ prepare_output_path(
+ output_folder,
+ allowed_suffix=[],
+ tag='output image folder',
+ path_type='dir',
+ overwrite=True)
+ command = [
+ 'ffmpeg', '-r', f'{fps}', '-i', input_path, '-loglevel', 'error', '-f',
+ 'image2', '-v', 'error', '-threads', '4', '-y', '-start_number', '0',
+ f'{output_folder}/{img_format}'
+ ]
+ if resolution:
+ height, width = resolution
+ command.insert(3, '-s')
+ command.insert(4, '%dx%d' % (width, height))
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+
+
+def crop_video(
+ input_path: str,
+ output_path: str,
+ box: Optional[Union[List[int], Tuple[int, int, int, int]]] = None,
+ resolution: Optional[Union[Tuple[int, int], Tuple[float, float]]] = None,
+ disable_log: bool = False,
+) -> None:
+ """Spatially or temporally crop a video or gif file.
+
+ Args:
+ input_path (str): input video or gif file path.
+ output_path (str): output video or gif file path.
+        box (Iterable[int], optional): [x, y, w, h], where (x, y) is the
+            top-left corner of the crop region. Defaults to None (full frame).
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): (height, width) of output. Defaults to None.
+        disable_log (bool, optional): whether to suppress the ffmpeg command info.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+        None.
+ """
+ check_input_path(
+ input_path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='input video',
+ path_type='file')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+
+ info = vid_info_reader(input_path)
+ width, height = int(info['width']), int(info['height'])
+
+ if box is None:
+ box = [0, 0, width, height]
+
+ assert len(box) == 4
+ x, y, w, h = box
+ assert (w > 0 and h > 0)
+ command = [
+ 'ffmpeg', '-i', input_path, '-vcodec', 'libx264', '-vf',
+ 'crop=%d:%d:%d:%d' % (w, h, x, y), '-loglevel', 'error', '-y',
+ output_path
+ ]
+ if resolution:
+ height, width = resolution
+ width += width % 2
+ height += height % 2
+ command.insert(-1, '-s')
+ command.insert(-1, '%dx%d' % (width, height))
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
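+
+
+# Minimal usage sketch of crop_video (illustration only, never called by this
+# module). Paths are hypothetical.
+def _example_crop_video() -> None:
+    # Keep a 640x360 window whose top-left corner sits at (x=100, y=50).
+    crop_video('demo.mp4', 'demo_crop.mp4', box=[100, 50, 640, 360])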
+
+
+def slice_video(input_path: str,
+ output_path: str,
+ start: int = 0,
+ end: Optional[int] = None,
+ resolution: Optional[Union[Tuple[int, int],
+ Tuple[float, float]]] = None,
+ disable_log: bool = False) -> None:
+ """Temporally crop a video/gif into another video/gif.
+
+ Args:
+ input_path (str): input video or gif file path.
+        output_path (str): output video or gif file path.
+ start (int, optional): start frame index. Defaults to 0.
+ end (int, optional): end frame index. Exclusive.
+ Could be positive int or negative int or None.
+ If None, all frames from start till the last frame are included.
+ Defaults to None.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): (height, width) of output. Defaults to None.
+        disable_log (bool, optional): whether to suppress the ffmpeg command info.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+        None
+ """
+ info = vid_info_reader(input_path)
+ num_frames = int(info['nb_frames'])
+ start = (min(start, num_frames - 1) + num_frames) % num_frames
+ end = (min(end, num_frames - 1) +
+ num_frames) % num_frames if end is not None else num_frames
+ command = [
+ 'ffmpeg', '-y', '-i', input_path, '-filter_complex',
+ f'[0]trim=start_frame={start}:end_frame={end}[v0]', '-map', '[v0]',
+ '-loglevel', 'error', '-vcodec', 'libx264', output_path
+ ]
+ if resolution:
+ height, width = resolution
+ width += width % 2
+ height += height % 2
+ command.insert(1, '-s')
+ command.insert(2, '%dx%d' % (width, height))
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
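+
+
+# Minimal usage sketch of slice_video (illustration only, never called by this
+# module). Paths are hypothetical.
+def _example_slice_video() -> None:
+    # Keep frames [30, 90) of the clip, i.e. 60 frames.
+    slice_video('demo.mp4', 'demo_slice.mp4', start=30, end=90)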
+
+
+def spatial_concat_video(input_path_list: List[str],
+ output_path: str,
+ array: List[int] = [1, 1],
+ direction: Literal['h', 'w'] = 'h',
+ resolution: Union[Tuple[int,
+ int], List[int], List[float],
+ Tuple[float, float]] = (512, 512),
+ remove_raw_files: bool = False,
+ padding: int = 0,
+ disable_log: bool = False) -> None:
+ """Spatially concat some videos as an array video.
+
+ Args:
+ input_path_list (list): input video or gif file list.
+ output_path (str): output video or gif file path.
+        array (List[int], optional): number of rows and columns of
+            the video array. Defaults to [1, 1].
+        direction (str, optional): 'h' to fill the grid row by row,
+            'w' to fill it column by column.
+            Defaults to 'h'.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]],
+ optional): (height, width) of output.
+ Defaults to (512, 512).
+ remove_raw_files (bool, optional): whether remove raw images.
+ Defaults to False.
+ padding (int, optional): width of pixels between videos.
+ Defaults to 0.
+        disable_log (bool, optional): whether to suppress the ffmpeg command info.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None
+ """
+ lowercase = string.ascii_lowercase
+ assert len(array) == 2
+ assert (array[0] * array[1]) >= len(input_path_list)
+ for path in input_path_list:
+ check_input_path(
+ path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='input video',
+ path_type='file')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+
+ command = ['ffmpeg']
+ height, width = resolution
+ scale_command = []
+ for index, vid_file in enumerate(input_path_list):
+ command.append('-i')
+ command.append(vid_file)
+ scale_command.append(
+ '[%d:v]scale=%d:%d:force_original_aspect_ratio=0[v%d];' %
+ (index, width, height, index))
+
+ scale_command = ' '.join(scale_command)
+ pad_command = '[v%d]pad=%d:%d[%s];' % (0, width * array[1] + padding *
+ (array[1] - 1),
+ height * array[0] + padding *
+ (array[0] - 1), lowercase[0])
+ for index in range(1, len(input_path_list)):
+ if direction == 'h':
+ pad_width = index % array[1] * (width + padding)
+ pad_height = index // array[1] * (height + padding)
+ else:
+ pad_width = index % array[0] * (width + padding)
+ pad_height = index // array[0] * (height + padding)
+
+ pad_command += '[%s][v%d]overlay=%d:%d' % (lowercase[index - 1], index,
+ pad_width, pad_height)
+ if index != len(input_path_list) - 1:
+ pad_command += '[%s];' % lowercase[index]
+
+ command += [
+ '-filter_complex',
+ '%s%s' % (scale_command, pad_command), '-loglevel', 'error', '-y',
+ output_path
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+
+ if remove_raw_files:
+ command = ['rm', '-f'] + input_path_list
+ subprocess.call(command)
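+
+
+# Minimal usage sketch of spatial_concat_video (illustration only, never
+# called by this module). Paths are hypothetical.
+def _example_spatial_concat_video() -> None:
+    # Tile four clips into a 2x2 grid, each resized to 360x640 (h x w), with a
+    # 5-pixel gutter between neighbours.
+    spatial_concat_video(['a.mp4', 'b.mp4', 'c.mp4', 'd.mp4'],
+                         'grid.mp4',
+                         array=[2, 2],
+                         direction='h',
+                         resolution=(360, 640),
+                         padding=5)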
+
+
+def temporal_concat_video(input_path_list: List[str],
+ output_path: str,
+ resolution: Union[Tuple[int, int],
+ Tuple[float, float]] = (512, 512),
+ remove_raw_files: bool = False,
+ disable_log: bool = False) -> None:
+ """Concat no matter videos or gifs into a temporal sequence, and save as a
+ new video or gif file.
+
+ Args:
+ input_path_list (List[str]): list of input video paths.
+ output_path (str): output video file path.
+ resolution (Optional[Union[Tuple[int, int], Tuple[float, float]]]
+            , optional): (height, width) of output.
+            Defaults to (512, 512).
+ remove_raw_files (bool, optional): whether remove the input videos.
+ Defaults to False.
+        disable_log (bool, optional): whether to suppress the ffmpeg command info.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None.
+ """
+ for path in input_path_list:
+ check_input_path(
+ path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='input video',
+ path_type='file')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+
+ height, width = resolution
+ command = ['ffmpeg']
+ concat_command = []
+ scale_command = []
+ for index, vid_file in enumerate(input_path_list):
+ command.append('-i')
+ command.append(vid_file)
+ scale_command.append(
+ '[%d:v]scale=%d:%d:force_original_aspect_ratio=0[v%d];' %
+ (index, width, height, index))
+ concat_command.append('[v%d]' % index)
+ concat_command = ''.join(concat_command)
+ scale_command = ''.join(scale_command)
+ command += [
+ '-filter_complex',
+ '%s%sconcat=n=%d:v=1:a=0[v]' %
+ (scale_command, concat_command, len(input_path_list)), '-loglevel',
+ 'error', '-map', '[v]', '-c:v', 'libx264', '-y', output_path
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+
+ if remove_raw_files:
+ command = ['rm'] + input_path_list
+ subprocess.call(command)
+
+
+def compress_video(input_path: str,
+ output_path: str,
+ compress_rate: int = 1,
+ down_sample_scale: Union[float, int] = 1,
+ fps: int = 30,
+ disable_log: bool = False) -> None:
+ """Compress a video file.
+
+ Args:
+ input_path (str): input video file path.
+ output_path (str): output video file path.
+        compress_rate (int, optional): compression rate, affects the bit rate.
+ Defaults to 1.
+ down_sample_scale (Union[float, int], optional): spatial down sample
+ scale. Defaults to 1.
+ fps (int, optional): Frames per second. Defaults to 30.
+        disable_log (bool, optional): whether to suppress the ffmpeg command info.
+ Defaults to False.
+ Raises:
+ FileNotFoundError: check the input path.
+ FileNotFoundError: check the output path.
+
+ Returns:
+ None.
+ """
+ input_pathinfo = Path(input_path)
+
+ check_input_path(
+ input_path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='input video',
+ path_type='file')
+ prepare_output_path(
+ output_path,
+ allowed_suffix=['.gif', '.mp4'],
+ tag='output video',
+ path_type='file',
+ overwrite=True)
+
+ info = vid_info_reader(input_path)
+
+ width = int(info['width'])
+ height = int(info['height'])
+ bit_rate = int(info['bit_rate'])
+ duration = float(info['duration'])
+ if (output_path == input_path) or (not output_path):
+ temp_outpath = os.path.join(
+ os.path.abspath(input_pathinfo.parent),
+ 'temp_file' + input_pathinfo.suffix)
+ else:
+ temp_outpath = output_path
+ new_width = int(width / down_sample_scale)
+ new_width += new_width % 2
+ new_height = int(height / down_sample_scale)
+ new_height += new_height % 2
+ command = [
+ 'ffmpeg', '-y', '-r',
+ str(info['r_frame_rate']), '-i', input_path, '-loglevel', 'error',
+ '-b:v', f'{bit_rate / (compress_rate * down_sample_scale)}', '-r',
+ f'{fps}', '-t', f'{duration}', '-s',
+ '%dx%d' % (new_width, new_height), temp_outpath
+ ]
+ if not disable_log:
+ print(f'Running \"{" ".join(command)}\"')
+ subprocess.call(command)
+ if (output_path == input_path) or (not output_path):
+ subprocess.call(['mv', '-f', temp_outpath, input_path])
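+
+
+# Minimal usage sketch of compress_video (illustration only, never called by
+# this module). Paths are hypothetical.
+def _example_compress_video() -> None:
+    # Halve the spatial resolution and cut the bit rate to roughly a quarter
+    # (bit_rate / (compress_rate * down_sample_scale)).
+    compress_video('demo.mp4', 'demo_small.mp4', compress_rate=2,
+                   down_sample_scale=2, fps=25)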
+
+
+def pad_for_libx264(image_array):
+ """Pad zeros if width or height of image_array is not divisible by 2.
+    Otherwise libx264 will fail with:
+
+ \"[libx264 @ 0x1b1d560] width not divisible by 2 \"
+
+ Args:
+ image_array (np.ndarray):
+ Image or images load by cv2.imread().
+ Possible shapes:
+ 1. [height, width]
+ 2. [height, width, channels]
+ 3. [images, height, width]
+ 4. [images, height, width, channels]
+
+ Returns:
+ np.ndarray:
+            An image with both edges divisible by 2.
+ """
+ if image_array.ndim == 2 or \
+ (image_array.ndim == 3 and image_array.shape[2] == 3):
+ hei_index = 0
+ wid_index = 1
+ elif image_array.ndim == 4 or \
+ (image_array.ndim == 3 and image_array.shape[2] != 3):
+ hei_index = 1
+ wid_index = 2
+ else:
+ return image_array
+ hei_pad = image_array.shape[hei_index] % 2
+ wid_pad = image_array.shape[wid_index] % 2
+ if hei_pad + wid_pad > 0:
+ pad_width = []
+ for dim_index in range(image_array.ndim):
+ if dim_index == hei_index:
+ pad_width.append((0, hei_pad))
+ elif dim_index == wid_index:
+ pad_width.append((0, wid_pad))
+ else:
+ pad_width.append((0, 0))
+ values = 0
+ image_array = \
+ np.pad(image_array,
+ pad_width,
+ mode='constant', constant_values=values)
+ return image_array
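+
+
+# Minimal sketch of pad_for_libx264 (illustration only, never called by this
+# module): an odd-sized frame gains one row and one column of zero padding so
+# that libx264 accepts it.
+def _example_pad_for_libx264() -> None:
+    img = np.zeros((479, 639, 3), dtype=np.uint8)
+    padded = pad_for_libx264(img)
+    assert padded.shape == (480, 640, 3)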
diff --git a/detrsmpl/utils/transforms.py b/detrsmpl/utils/transforms.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e3b54ab62ac4ebd965aa726b466909c5263df5a
--- /dev/null
+++ b/detrsmpl/utils/transforms.py
@@ -0,0 +1,590 @@
+from typing import Union
+
+import numpy
+import torch
+
+from detrsmpl.core.conventions.joints_mapping.standard_joint_angles import (
+ TRANSFORMATION_AA_TO_SJA,
+ TRANSFORMATION_SJA_TO_AA,
+)
+from .logger import get_root_logger
+
+try:
+ from pytorch3d.transforms import (
+ axis_angle_to_matrix,
+ axis_angle_to_quaternion,
+ euler_angles_to_matrix,
+ matrix_to_euler_angles,
+ matrix_to_quaternion,
+ matrix_to_rotation_6d,
+ quaternion_to_axis_angle,
+ quaternion_to_matrix,
+ rotation_6d_to_matrix,
+ )
+except (ImportError, ModuleNotFoundError):
+ import traceback
+ logger = get_root_logger()
+ stack_str = ''
+ for line in traceback.format_stack():
+ if 'frozen' not in line:
+ stack_str += line + '\n'
+ import_exception = traceback.format_exc() + '\n'
+ warning_msg = stack_str + import_exception + \
+ 'If pytorch3d is not required,' +\
+ ' this warning could be ignored.'
+ logger.warning(warning_msg)
+
+
+class Compose:
+ def __init__(self, transforms: list):
+ """Composes several transforms together. This transform does not
+ support torchscript.
+
+ Args:
+ transforms (list): (list of transform functions)
+ """
+ self.transforms = transforms
+
+ def __call__(self,
+ rotation: Union[torch.Tensor, numpy.ndarray],
+ convention: str = 'xyz',
+ **kwargs):
+ convention = convention.lower()
+ if not (set(convention) == set('xyz') and len(convention) == 3):
+ raise ValueError(f'Invalid convention {convention}.')
+ if isinstance(rotation, numpy.ndarray):
+ data_type = 'numpy'
+ rotation = torch.FloatTensor(rotation)
+ elif isinstance(rotation, torch.Tensor):
+ data_type = 'tensor'
+ else:
+ raise TypeError(
+ 'Type of rotation should be torch.Tensor or numpy.ndarray')
+ for t in self.transforms:
+ if 'convention' in t.__code__.co_varnames:
+ rotation = t(rotation, convention.upper(), **kwargs)
+ else:
+ rotation = t(rotation, **kwargs)
+ if data_type == 'numpy':
+ rotation = rotation.detach().cpu().numpy()
+ return rotation
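+
+
+# Minimal usage sketch of Compose (illustration only, never called by this
+# module; requires pytorch3d): numpy input yields numpy output with the same
+# batch shape.
+def _example_compose() -> None:
+    aa = numpy.zeros((10, 3), dtype=numpy.float32)  # ten zero rotations
+    to_rotmat = Compose([axis_angle_to_matrix])
+    rotmat = to_rotmat(aa)  # numpy array of shape (10, 3, 3), all identities
+    assert rotmat.shape == (10, 3, 3)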
+
+
+def aa_to_rotmat(
+ axis_angle: Union[torch.Tensor, numpy.ndarray]
+) -> Union[torch.Tensor, numpy.ndarray]:
+ """
+    Convert axis_angle to rotation matrices.
+ Args:
+ axis_angle (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 3). ndim of input is unlimited.
+
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 3, 3).
+ """
+ if axis_angle.shape[-1] != 3:
+ raise ValueError(
+ f'Invalid input axis angles shape f{axis_angle.shape}.')
+ t = Compose([axis_angle_to_matrix])
+ return t(axis_angle)
+
+
+def aa_to_quat(
+ axis_angle: Union[torch.Tensor, numpy.ndarray]
+) -> Union[torch.Tensor, numpy.ndarray]:
+ """
+ Convert axis_angle to quaternions.
+ Args:
+ axis_angle (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 3). ndim of input is unlimited.
+
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 4).
+ """
+ if axis_angle.shape[-1] != 3:
+ raise ValueError(f'Invalid input axis angles f{axis_angle.shape}.')
+ t = Compose([axis_angle_to_quaternion])
+ return t(axis_angle)
+
+
+def ee_to_rotmat(euler_angle: Union[torch.Tensor, numpy.ndarray],
+ convention='xyz') -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert euler angle to rotation matrixs.
+
+ Args:
+ euler_angle (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 3). ndim of input is unlimited.
+ convention (str, optional): Convention string of three letters
+ from {“x”, “y”, and “z”}. Defaults to 'xyz'.
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 3, 3).
+ """
+ if euler_angle.shape[-1] != 3:
+ raise ValueError(
+ f'Invalid input euler angles shape f{euler_angle.shape}.')
+ t = Compose([euler_angles_to_matrix])
+ return t(euler_angle, convention.upper())
+
+
+def rotmat_to_ee(
+ matrix: Union[torch.Tensor, numpy.ndarray],
+ convention: str = 'xyz') -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert rotation matrixs to euler angle.
+
+ Args:
+ matrix (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 3, 3). ndim of input is unlimited.
+ convention (str, optional): Convention string of three letters
+ from {“x”, “y”, and “z”}. Defaults to 'xyz'.
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 3).
+ """
+ if matrix.shape[-1] != 3 or matrix.shape[-2] != 3:
+ raise ValueError(f'Invalid rotation matrix shape f{matrix.shape}.')
+ t = Compose([matrix_to_euler_angles])
+ return t(matrix, convention.upper())
+
+
+def rotmat_to_quat(
+ matrix: Union[torch.Tensor, numpy.ndarray]
+) -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert rotation matrixs to quaternions.
+
+ Args:
+ matrix (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 3, 3). ndim of input is unlimited.
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 4).
+ """
+ if matrix.shape[-1] != 3 or matrix.shape[-2] != 3:
+ raise ValueError(f'Invalid rotation matrix shape f{matrix.shape}.')
+ t = Compose([matrix_to_quaternion])
+ return t(matrix)
+
+
+def rotmat_to_rot6d(
+ matrix: Union[torch.Tensor, numpy.ndarray]
+) -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert rotation matrixs to rotation 6d representations.
+
+ Args:
+ matrix (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 3, 3). ndim of input is unlimited.
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 6).
+
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
+ On the Continuity of Rotation Representations in Neural Networks.
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
+ Retrieved from http://arxiv.org/abs/1812.07035
+ """
+ if matrix.shape[-1] != 3 or matrix.shape[-2] != 3:
+ raise ValueError(f'Invalid rotation matrix shape f{matrix.shape}.')
+ t = Compose([matrix_to_rotation_6d])
+ return t(matrix)
+
+
+def quat_to_aa(
+ quaternions: Union[torch.Tensor, numpy.ndarray]
+) -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert quaternions to axis angles.
+
+ Args:
+ quaternions (Union[torch.Tensor, numpy.ndarray]): input shape
+            should be (..., 4). ndim of input is unlimited.
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 3).
+ """
+ if quaternions.shape[-1] != 4:
+ raise ValueError(f'Invalid input quaternions f{quaternions.shape}.')
+ t = Compose([quaternion_to_axis_angle])
+ return t(quaternions)
+
+
+def quat_to_rotmat(
+ quaternions: Union[torch.Tensor, numpy.ndarray]
+) -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert quaternions to rotation matrixs.
+
+ Args:
+ quaternions (Union[torch.Tensor, numpy.ndarray]): input shape
+            should be (..., 4). ndim of input is unlimited.
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 3, 3).
+ """
+ if quaternions.shape[-1] != 4:
+ raise ValueError(
+ f'Invalid input quaternions shape f{quaternions.shape}.')
+ t = Compose([quaternion_to_matrix])
+ return t(quaternions)
+
+
+def rot6d_to_rotmat(
+ rotation_6d: Union[torch.Tensor, numpy.ndarray]
+) -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert rotation 6d representations to rotation matrixs.
+
+ Args:
+ rotation_6d (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 6). ndim of input is unlimited.
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 3, 3).
+
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
+ On the Continuity of Rotation Representations in Neural Networks.
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
+ Retrieved from http://arxiv.org/abs/1812.07035
+ """
+ if rotation_6d.shape[-1] != 6:
+ raise ValueError(f'Invalid input rotation_6d f{rotation_6d.shape}.')
+ t = Compose([rotation_6d_to_matrix])
+ return t(rotation_6d)
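+
+
+# Minimal sketch (illustration only, never called by this module; requires
+# pytorch3d): the 6d representation round-trips back to the original rotation
+# matrices, which is the property motivating its use.
+def _example_rot6d_roundtrip() -> None:
+    rotmat = torch.eye(3).repeat(4, 1, 1)  # four identity rotations, (4, 3, 3)
+    rot6d = rotmat_to_rot6d(rotmat)        # (4, 6)
+    restored = rot6d_to_rotmat(rot6d)      # (4, 3, 3)
+    assert torch.allclose(restored, rotmat, atol=1e-6)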
+
+
+def aa_to_ee(axis_angle: Union[torch.Tensor, numpy.ndarray],
+ convention: str = 'xyz') -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert axis angles to euler angle.
+
+ Args:
+ axis_angle (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 3). ndim of input is unlimited.
+ convention (str, optional): Convention string of three letters
+ from {“x”, “y”, and “z”}. Defaults to 'xyz'.
+
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 3).
+ """
+ if axis_angle.shape[-1] != 3:
+ raise ValueError(
+ f'Invalid input axis_angle shape f{axis_angle.shape}.')
+ t = Compose([axis_angle_to_matrix, matrix_to_euler_angles])
+ return t(axis_angle, convention)
+
+
+def aa_to_rot6d(
+ axis_angle: Union[torch.Tensor, numpy.ndarray]
+) -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert axis angles to rotation 6d representations.
+
+ Args:
+ axis_angle (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 3). ndim of input is unlimited.
+
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 6).
+
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
+ On the Continuity of Rotation Representations in Neural Networks.
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
+ Retrieved from http://arxiv.org/abs/1812.07035
+ """
+ if axis_angle.shape[-1] != 3:
+ raise ValueError(f'Invalid input axis_angle f{axis_angle.shape}.')
+ t = Compose([axis_angle_to_matrix, matrix_to_rotation_6d])
+ return t(axis_angle)
+
+
+def ee_to_aa(euler_angle: Union[torch.Tensor, numpy.ndarray],
+ convention: str = 'xyz') -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert euler angles to axis angles.
+
+ Args:
+ euler_angle (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 3). ndim of input is unlimited.
+ convention (str, optional): Convention string of three letters
+ from {“x”, “y”, and “z”}. Defaults to 'xyz'.
+
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 3).
+ """
+ if euler_angle.shape[-1] != 3:
+ raise ValueError(f'Invalid input euler_angle f{euler_angle.shape}.')
+ t = Compose([
+ euler_angles_to_matrix, matrix_to_quaternion, quaternion_to_axis_angle
+ ])
+ return t(euler_angle, convention)
+
+
+def ee_to_quat(euler_angle: Union[torch.Tensor, numpy.ndarray],
+ convention='xyz') -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert euler angles to quaternions.
+
+ Args:
+ euler_angle (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 3). ndim of input is unlimited.
+ convention (str, optional): Convention string of three letters
+ from {“x”, “y”, and “z”}. Defaults to 'xyz'.
+
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 4).
+ """
+ if euler_angle.shape[-1] != 3:
+ raise ValueError(f'Invalid input euler_angle f{euler_angle.shape}.')
+ t = Compose([euler_angles_to_matrix, matrix_to_quaternion])
+ return t(euler_angle, convention)
+
+
+def ee_to_rot6d(euler_angle: Union[torch.Tensor, numpy.ndarray],
+ convention='xyz') -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert euler angles to rotation 6d representation.
+
+ Args:
+ euler_angle (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 3). ndim of input is unlimited.
+ convention (str, optional): Convention string of three letters
+ from {“x”, “y”, and “z”}. Defaults to 'xyz'.
+
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 6).
+
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
+ On the Continuity of Rotation Representations in Neural Networks.
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
+ Retrieved from http://arxiv.org/abs/1812.07035
+ """
+ if euler_angle.shape[-1] != 3:
+ raise ValueError(f'Invalid input euler_angle f{euler_angle.shape}.')
+ t = Compose([euler_angles_to_matrix, matrix_to_rotation_6d])
+ return t(euler_angle, convention)
+
+
+def rotmat_to_aa(
+ matrix: Union[torch.Tensor, numpy.ndarray],
+ convention: str = 'xyz') -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert rotation matrixs to axis angles.
+
+ Args:
+ matrix (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 3, 3). ndim of input is unlimited.
+ convention (str, optional): Convention string of three letters
+ from {“x”, “y”, and “z”}. Defaults to 'xyz'.
+
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 3).
+ """
+ if matrix.shape[-1] != 3 or matrix.shape[-2] != 3:
+ raise ValueError(f'Invalid rotation matrix shape f{matrix.shape}.')
+ t = Compose([matrix_to_quaternion, quaternion_to_axis_angle])
+ return t(matrix)
+
+
+def quat_to_ee(quaternions: Union[torch.Tensor, numpy.ndarray],
+ convention: str = 'xyz') -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert quaternions to euler angles.
+
+ Args:
+ quaternions (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 4). ndim of input is unlimited.
+ convention (str, optional): Convention string of three letters
+ from {“x”, “y”, and “z”}. Defaults to 'xyz'.
+
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 3).
+ """
+ if quaternions.shape[-1] != 4:
+ raise ValueError(f'Invalid input quaternions f{quaternions.shape}.')
+ t = Compose([quaternion_to_matrix, matrix_to_euler_angles])
+ return t(quaternions, convention)
+
+
+def quat_to_rot6d(
+ quaternions: Union[torch.Tensor, numpy.ndarray]
+) -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert quaternions to rotation 6d representations.
+
+ Args:
+ quaternions (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 4). ndim of input is unlimited.
+
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 6).
+
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
+ On the Continuity of Rotation Representations in Neural Networks.
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
+ Retrieved from http://arxiv.org/abs/1812.07035
+ """
+ if quaternions.shape[-1] != 4:
+ raise ValueError(f'Invalid input quaternions f{quaternions.shape}.')
+ t = Compose([quaternion_to_matrix, matrix_to_rotation_6d])
+ return t(quaternions)
+
+
+def rot6d_to_aa(
+ rotation_6d: Union[torch.Tensor, numpy.ndarray]
+) -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert rotation 6d representations to axis angles.
+
+ Args:
+ rotation_6d (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 6). ndim of input is unlimited.
+
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 3).
+
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
+ On the Continuity of Rotation Representations in Neural Networks.
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
+ Retrieved from http://arxiv.org/abs/1812.07035
+ """
+ if rotation_6d.shape[-1] != 6:
+ raise ValueError(f'Invalid input rotation_6d f{rotation_6d.shape}.')
+ t = Compose([
+ rotation_6d_to_matrix, matrix_to_quaternion, quaternion_to_axis_angle
+ ])
+ return t(rotation_6d)
+
+
+def rot6d_to_ee(rotation_6d: Union[torch.Tensor, numpy.ndarray],
+ convention: str = 'xyz') -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert rotation 6d representations to euler angles.
+
+ Args:
+ rotation_6d (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 6). ndim of input is unlimited.
+
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 3).
+
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
+ On the Continuity of Rotation Representations in Neural Networks.
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
+ Retrieved from http://arxiv.org/abs/1812.07035
+ """
+ if rotation_6d.shape[-1] != 6:
+ raise ValueError(f'Invalid input rotation_6d f{rotation_6d.shape}.')
+ t = Compose([rotation_6d_to_matrix, matrix_to_euler_angles])
+ return t(rotation_6d, convention)
+
+
+def rot6d_to_quat(
+ rotation_6d: Union[torch.Tensor, numpy.ndarray]
+) -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert rotation 6d representations to quaternions.
+
+ Args:
+        rotation_6d (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 6). ndim of input is unlimited.
+
+ Returns:
+ Union[torch.Tensor, numpy.ndarray]: shape would be (..., 4).
+
+ [1] Zhou, Y., Barnes, C., Lu, J., Yang, J., & Li, H.
+ On the Continuity of Rotation Representations in Neural Networks.
+ IEEE Conference on Computer Vision and Pattern Recognition, 2019.
+ Retrieved from http://arxiv.org/abs/1812.07035
+ """
+ if rotation_6d.shape[-1] != 6:
+ raise ValueError(
+ f'Invalid input rotation_6d shape f{rotation_6d.shape}.')
+ t = Compose([rotation_6d_to_matrix, matrix_to_quaternion])
+ return t(rotation_6d)
+
+
+def aa_to_sja(
+ axis_angle: Union[torch.Tensor, numpy.ndarray],
+ R_t: Union[torch.Tensor, numpy.ndarray] = TRANSFORMATION_AA_TO_SJA,
+ R_t_inv: Union[torch.Tensor, numpy.ndarray] = TRANSFORMATION_SJA_TO_AA
+) -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert axis-angles to standard joint angles.
+
+ Args:
+ axis_angle (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 21, 3), ndim of input is unlimited.
+ R_t (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 21, 3, 3). Transformation matrices from
+ original axis-angle coordinate system to
+ standard joint angle coordinate system,
+ ndim of input is unlimited.
+ R_t_inv (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 21, 3, 3). Transformation matrices from
+ standard joint angle coordinate system to
+ original axis-angle coordinate system,
+ ndim of input is unlimited.
+
+ Returns:
+        Union[torch.Tensor, numpy.ndarray]: shape would be (..., 21, 3).
+ """
+ def _aa_to_sja(aa, R_t, R_t_inv):
+ R_aa = axis_angle_to_matrix(aa)
+ R_sja = R_t @ R_aa @ R_t_inv
+ sja = matrix_to_euler_angles(R_sja, convention='XYZ')
+ return sja
+
+ if axis_angle.shape[-2:] != (21, 3):
+ raise ValueError(
+ f'Invalid input axis angles shape f{axis_angle.shape}.')
+ if R_t.shape[-3:] != (21, 3, 3):
+ raise ValueError(f'Invalid input R_t shape f{R_t.shape}.')
+ if R_t_inv.shape[-3:] != (21, 3, 3):
+        raise ValueError(f'Invalid input R_t_inv shape {R_t_inv.shape}.')
+ t = Compose([_aa_to_sja])
+ return t(axis_angle, R_t=R_t, R_t_inv=R_t_inv)
+
+
+def sja_to_aa(
+ sja: Union[torch.Tensor, numpy.ndarray],
+ R_t: Union[torch.Tensor, numpy.ndarray] = TRANSFORMATION_AA_TO_SJA,
+ R_t_inv: Union[torch.Tensor, numpy.ndarray] = TRANSFORMATION_SJA_TO_AA
+) -> Union[torch.Tensor, numpy.ndarray]:
+ """Convert standard joint angles to axis angles.
+
+ Args:
+ sja (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 21, 3). ndim of input is unlimited.
+ R_t (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 21, 3, 3). Transformation matrices from
+ original axis-angle coordinate system to
+ standard joint angle coordinate system
+ R_t_inv (Union[torch.Tensor, numpy.ndarray]): input shape
+ should be (..., 21, 3, 3). Transformation matrices from
+ standard joint angle coordinate system to
+ original axis-angle coordinate system
+
+ Returns:
+        Union[torch.Tensor, numpy.ndarray]: shape would be (..., 21, 3).
+ """
+ def _sja_to_aa(sja, R_t, R_t_inv):
+ R_sja = euler_angles_to_matrix(sja, convention='XYZ')
+ R_aa = R_t_inv @ R_sja @ R_t
+ aa = quaternion_to_axis_angle(matrix_to_quaternion(R_aa))
+ return aa
+
+ if sja.shape[-2:] != (21, 3):
+        raise ValueError(f'Invalid input sja shape {sja.shape}.')
+ if R_t.shape[-3:] != (21, 3, 3):
+ raise ValueError(f'Invalid input R_t shape f{R_t.shape}.')
+ if R_t_inv.shape[-3:] != (21, 3, 3):
+        raise ValueError(f'Invalid input R_t_inv shape {R_t_inv.shape}.')
+ t = Compose([_sja_to_aa])
+ return t(sja, R_t=R_t, R_t_inv=R_t_inv)
+
+
+def make_homegeneous_rotmat_batch(input: torch.Tensor) -> torch.Tensor:
+ """Appends a row of [0,0,0,1] to a batch size x 3 x 4 Tensor.
+
+ Parameters
+ ----------
+ :param input: A tensor of dimensions batch size x 3 x 4
+ :return: A tensor batch size x 4 x 4 (appended with 0,0,0,1)
+ """
+ batch_size = input.shape[0]
+ row_append = torch.tensor([0.0, 0.0, 0.0, 1.0], dtype=torch.float)
+ row_append.requires_grad = False
+ padded_tensor = torch.cat(
+ [input, row_append.view(1, 1, 4).repeat(batch_size, 1, 1)], dim=1)
+ return padded_tensor
+
+
+def make_homegeneous_rotmat(input: torch.Tensor) -> torch.Tensor:
+ """Appends a row of [0,0,0,1] to a 3 x 4 Tensor.
+
+ Parameters
+ ----------
+ :param input: A tensor of dimensions 3 x 4
+    :return: A tensor of dimensions 4 x 4 (appended with 0,0,0,1)
+ """
+ row_append = torch.tensor([0.0, 0.0, 0.0, 1.0], dtype=torch.float)
+ row_append.requires_grad = False
+    padded_tensor = torch.cat([input, row_append.view(1, 4)], dim=0)
+ return padded_tensor
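+
+
+# Minimal sketch of make_homegeneous_rotmat_batch (illustration only, never
+# called by this module): a batch of 3 x 4 [R|t] matrices becomes a batch of
+# 4 x 4 homogeneous transforms.
+def _example_make_homegeneous_rotmat_batch() -> None:
+    rt = torch.zeros(2, 3, 4)                # two [R|t] matrices
+    hom = make_homegeneous_rotmat_batch(rt)  # (2, 4, 4)
+    assert hom.shape == (2, 4, 4)
+    assert bool(torch.all(hom[:, 3, 3] == 1))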
diff --git a/detrsmpl/utils/util_mixins.py b/detrsmpl/utils/util_mixins.py
new file mode 100644
index 0000000000000000000000000000000000000000..923f1982237f3b3ac6613ba1376f55211014f551
--- /dev/null
+++ b/detrsmpl/utils/util_mixins.py
@@ -0,0 +1,104 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""This module defines the :class:`NiceRepr` mixin class, which defines a
+``__repr__`` and ``__str__`` method that only depend on a custom ``__nice__``
+method, which you must define. This means you only have to overload one
+function instead of two. Furthermore, if the object defines a ``__len__``
+method, then the ``__nice__`` method defaults to something sensible, otherwise
+it is treated as abstract and raises ``NotImplementedError``.
+
+To use simply have your object inherit from :class:`NiceRepr`
+(multi-inheritance should be ok).
+
+This code was copied from the ubelt library: https://github.com/Erotemic/ubelt
+
+Example:
+ >>> # Objects that define __nice__ have a default __str__ and __repr__
+ >>> class Student(NiceRepr):
+ ... def __init__(self, name):
+ ... self.name = name
+ ... def __nice__(self):
+ ... return self.name
+ >>> s1 = Student('Alice')
+ >>> s2 = Student('Bob')
+ >>> print(f's1 = {s1}')
+ >>> print(f's2 = {s2}')
+    s1 = <Student(Alice)>
+    s2 = <Student(Bob)>
+
+Example:
+ >>> # Objects that define __len__ have a default __nice__
+ >>> class Group(NiceRepr):
+ ... def __init__(self, data):
+ ... self.data = data
+ ... def __len__(self):
+ ... return len(self.data)
+ >>> g = Group([1, 2, 3])
+ >>> print(f'g = {g}')
+    g = <Group(3)>
+"""
+import warnings
+
+
+class NiceRepr:
+ """Inherit from this class and define ``__nice__`` to "nicely" print your
+ objects.
+
+ Defines ``__str__`` and ``__repr__`` in terms of ``__nice__`` function
+ Classes that inherit from :class:`NiceRepr` should redefine ``__nice__``.
+    If the inheriting class has a ``__len__`` method, then the default
+ ``__nice__`` method will return its length.
+
+ Example:
+ >>> class Foo(NiceRepr):
+ ... def __nice__(self):
+ ... return 'info'
+ >>> foo = Foo()
+        >>> assert str(foo) == '<Foo(info)>'
+        >>> assert repr(foo).startswith('<Foo(info) at ')
+
+    Example:
+        >>> class Bar(NiceRepr):
+ ... pass
+ >>> bar = Bar()
+ >>> import pytest
+ >>> with pytest.warns(None) as record:
+ >>> assert 'object at' in str(bar)
+ >>> assert 'object at' in repr(bar)
+
+ Example:
+ >>> class Baz(NiceRepr):
+ ... def __len__(self):
+ ... return 5
+ >>> baz = Baz()
+        >>> assert str(baz) == '<Baz(5)>'
+ """
+ def __nice__(self):
+ """str: a "nice" summary string describing this module"""
+ if hasattr(self, '__len__'):
+ # It is a common pattern for objects to use __len__ in __nice__
+ # As a convenience we define a default __nice__ for these objects
+ return str(len(self))
+ else:
+ # In all other cases force the subclass to overload __nice__
+ raise NotImplementedError(
+ f'Define the __nice__ method for {self.__class__!r}')
+
+ def __repr__(self):
+ """str: the string of the module"""
+ try:
+ nice = self.__nice__()
+ classname = self.__class__.__name__
+ return f'<{classname}({nice}) at {hex(id(self))}>'
+ except NotImplementedError as ex:
+ warnings.warn(str(ex), category=RuntimeWarning)
+ return object.__repr__(self)
+
+ def __str__(self):
+ """str: the string of the module"""
+ try:
+ classname = self.__class__.__name__
+ nice = self.__nice__()
+ return f'<{classname}({nice})>'
+ except NotImplementedError as ex:
+ warnings.warn(str(ex), category=RuntimeWarning)
+ return object.__repr__(self)
diff --git a/detrsmpl/version.py b/detrsmpl/version.py
new file mode 100644
index 0000000000000000000000000000000000000000..c64e135f874d571b38b00ed075df3035638073eb
--- /dev/null
+++ b/detrsmpl/version.py
@@ -0,0 +1,29 @@
+# Copyright (c) Open-MMLab. All rights reserved.
+
+# __version__ = '0.9.0'
+__version__ = '0.10.0'
+
+
+def parse_version_info(version_str):
+ """Parse a version string into a tuple.
+
+ Args:
+ version_str (str): The version string.
+ Returns:
+ tuple[int | str]: The version info, e.g., "1.3.0" is parsed into
+ (1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1').
+ """
+ version_info = []
+ for x in version_str.split('.'):
+ if x.isdigit():
+ version_info.append(int(x))
+ elif x.find('rc') != -1:
+ patch_version = x.split('rc')
+ version_info.append(int(patch_version[0]))
+ version_info.append(f'rc{patch_version[1]}')
+ return tuple(version_info)
+
+
+version_info = parse_version_info(__version__)
+
+__all__ = ['__version__', 'version_info', 'parse_version_info']
diff --git a/engine.py b/engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..14f7082674d6e89f9e4a40d66b8aef2a21c8f06f
--- /dev/null
+++ b/engine.py
@@ -0,0 +1,352 @@
+import math
+import os
+import time
+import datetime
+import sys
+from typing import Iterable
+import os.path as osp
+import torch
+import util.misc as utils
+from collections import OrderedDict
+import mmcv
+import numpy as np
+import torch.distributed as dist
+from mmcv.runner import get_dist_info
+from detrsmpl.apis.test import collect_results_cpu, collect_results_gpu
+from detrsmpl.utils.ffmpeg_utils import images_to_video
+from torch.utils.tensorboard import SummaryWriter
+import json
+from mmcv.runner import get_dist_info, init_dist
+
+def round_float(items):
+ if isinstance(items, list):
+ return [round_float(item) for item in items]
+ elif isinstance(items, float):
+ return round(items, 3)
+ elif isinstance(items, np.ndarray):
+ return round_float(float(items))
+ elif isinstance(items, torch.Tensor):
+ return round_float(items.detach().cpu().numpy())
+ else:
+ return items
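+
+# Small sketch of round_float (illustration only, never called by the training
+# loop): floats are rounded to 3 decimals, tensors and 0-d numpy arrays are
+# converted to plain floats first, and anything else passes through unchanged.
+def _example_round_float():
+    values = [0.123456, torch.tensor(1.98765), 'kept as-is']
+    return round_float(values)  # -> [0.123, 1.988, 'kept as-is']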
+
+def train_one_epoch(model: torch.nn.Module,
+ criterion: torch.nn.Module,
+ data_loader: Iterable,
+ optimizer: torch.optim.Optimizer,
+ device: torch.device,
+ epoch: int,
+ max_norm: float = 0,
+ wo_class_error=False,
+ lr_scheduler=None,
+ args=None,
+ logger=None,
+ ema_m=None,
+ tf_writer=None):
+ scaler = torch.cuda.amp.GradScaler(enabled=args.amp)
+
+ try:
+ need_tgt_for_training = args.use_dn
+    except AttributeError:
+ need_tgt_for_training = False
+
+ model.train()
+ criterion.train()
+ metric_logger = utils.MetricLogger(delimiter=' ')
+ metric_logger.add_meter(
+ 'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
+ if not wo_class_error:
+ metric_logger.add_meter(
+ 'class_error', utils.SmoothedValue(window_size=1,
+ fmt='{value:.2f}'))
+ header = 'Epoch: [{}]'.format(epoch)
+ print_freq = 10
+
+ _cnt = 0
+
+ for step_i, data_batch in enumerate(metric_logger.log_every(data_loader,
+ print_freq,
+ header,
+ logger=logger)):
+ with torch.cuda.amp.autocast(enabled=args.amp):
+ if need_tgt_for_training:
+ outputs, targets, data_batch_nc = model(data_batch)
+ else:
+ outputs, targets, data_batch_nc = model(data_batch)
+
+            # The keys below are temporarily down-weighted by 10x while the
+            # loss is built and restored a few lines further down:
+            # ['hand_kp3d_4', 'face_kp3d_4', 'hand_kp2d_4']
+ loss_dict = criterion(outputs, targets, data_batch=data_batch_nc)
+ weight_dict = criterion.weight_dict
+
+ for k,v in weight_dict.items():
+ for n in ['hand_kp3d_4', 'face_kp3d_4', 'hand_kp2d_4']:
+ if n in k:
+ weight_dict[k] = weight_dict[k]/10
+
+ losses = sum(loss_dict[k] * weight_dict[k]
+ for k in loss_dict.keys() if k in weight_dict)
+
+ loss_dict_reduced = utils.reduce_dict(loss_dict)
+ loss_dict_reduced_unscaled = {
+ f'{k}_unscaled': v
+ for k, v in loss_dict_reduced.items()
+ }
+ loss_dict_reduced_scaled = {
+ k: v * weight_dict[k]
+ for k, v in loss_dict_reduced.items() if k in weight_dict
+ }
+ losses_reduced_scaled = sum(loss_dict_reduced_scaled.values())
+
+ loss_value = losses_reduced_scaled.item()
+        # Restore the hand/face keypoint loss weights scaled down above.
+        for k, v in weight_dict.items():
+            for n in ['hand_kp3d_4', 'face_kp3d_4', 'hand_kp2d_4']:
+                if n in k:
+                    weight_dict[k] = weight_dict[k] * 10
+ if not math.isfinite(loss_value):
+ print('Loss is {}, stopping training'.format(loss_value))
+ print(loss_dict_reduced)
+ sys.exit(1)
+
+ # amp backward function
+ if args.amp:
+ optimizer.zero_grad()
+ scaler.scale(losses).backward()
+ if max_norm > 0:
+ scaler.unscale_(optimizer)
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
+ scaler.step(optimizer)
+ scaler.update()
+ else:
+ optimizer.zero_grad()
+ losses.backward()
+ if max_norm > 0:
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
+ optimizer.step()
+ if args.onecyclelr:
+ lr_scheduler.step()
+ if args.use_ema:
+ if epoch >= args.ema_epoch:
+ ema_m.update(model)
+ rank, _ = get_dist_info()
+
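+        # Rank 0 writes TensorBoard scalars and appends one JSON line per step
+        # to train.log.json.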
+ if rank == 0:
+ tf_writer.add_scalar(
+ 'loss', round_float(loss_value), step_i + len(data_loader) * epoch)
+ for k, v in loss_dict_reduced_scaled.items():
+ tf_writer.add_scalar(
+ k, round_float(v), step_i + len(data_loader) * epoch)
+ for k, v in loss_dict_reduced_unscaled.items():
+ tf_writer.add_scalar(
+ k, round_float(v), step_i + len(data_loader) * epoch)
+ json_log = OrderedDict()
+ json_log['now_time'] = str(datetime.datetime.now())
+ json_log['epoch'] = epoch
+ json_log['lr'] = optimizer.param_groups[0]['lr']
+ json_log['loss'] = round_float(loss_value)
+ for k, v in loss_dict_reduced_scaled.items():
+ json_log[k] = round_float(v)
+
+ for k, v in loss_dict_reduced_unscaled.items():
+ json_log[k] = round_float(v)
+
+ if rank == 0:
+ log_path = os.path.join(args.output_dir, 'train.log.json')
+ with open(log_path, 'a+') as f:
+ mmcv.dump(json_log, f, file_format='json')
+ f.write('\n')
+
+ # metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled)
+ metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled)
+ if 'class_error' in loss_dict_reduced:
+ metric_logger.update(class_error=loss_dict_reduced['class_error'])
+ metric_logger.update(lr=optimizer.param_groups[0]['lr'])
+
+ _cnt += 1
+ if args.debug:
+ if _cnt % 15 == 0:
+ print('BREAK!' * 5)
+ break
+
+ if getattr(criterion, 'loss_weight_decay', False):
+ criterion.loss_weight_decay(epoch=epoch)
+ if getattr(criterion, 'tuning_matching', False):
+ criterion.tuning_matching(epoch)
+
+ metric_logger.synchronize_between_processes()
+ print('Averaged stats:', metric_logger)
+ resstat = {
+ k: meter.global_avg
+ for k, meter in metric_logger.meters.items() if meter.count > 0
+ }
+ if getattr(criterion, 'loss_weight_decay', False):
+ resstat.update(
+ {f'weight_{k}': v
+ for k, v in criterion.weight_dict.items()})
+ return resstat
+
+
+@torch.no_grad()
+def evaluate(model,
+ criterion,
+ postprocessors,
+ data_loader,
+ device,
+ output_dir,
+ wo_class_error=False,
+ tmpdir=None,
+ gpu_collect=False,
+ args=None,
+ logger=None):
+ try:
+ need_tgt_for_training = args.use_dn
+    except AttributeError:
+ need_tgt_for_training = False
+ model.eval()
+ criterion.eval()
+
+ metric_logger = utils.MetricLogger(delimiter=' ')
+ if not wo_class_error:
+ metric_logger.add_meter(
+ 'class_error', utils.SmoothedValue(window_size=1,
+ fmt='{value:.2f}'))
+ header = 'Test:'
+ iou_types = tuple(k for k in ('bbox', 'keypoints'))
+ try:
+ useCats = args.useCats
+    except AttributeError:
+ useCats = True
+ if not useCats:
+ print('useCats: {} !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'.format(
+ useCats))
+
+ _cnt = 0
+ results = []
+ dataset = data_loader.dataset
+ rank, world_size = get_dist_info()
+
+ if rank == 0:
+ # Check if tmpdir is valid for cpu_collect
+        if (not gpu_collect) and (tmpdir is not None and osp.exists(tmpdir)):
+            raise OSError(f'The tmpdir {tmpdir} already exists. '
+                          'Since tmpdir will be deleted after testing, '
+                          'please make sure you specify an empty one.')
+ prog_bar = mmcv.ProgressBar(len(dataset))
+ time.sleep(2)
+ # i=0
+ cur_sample_idx = 0
+ eval_result = {}
+ # print()
+ cur_eval_result_list = []
+
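+    # Each rank evaluates its own shard of the dataset; the per-sample results
+    # are gathered on CPU afterwards and rank 0 aggregates and prints metrics.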
+ for data_batch in metric_logger.log_every(
+ data_loader, 10, header, logger=logger):
+ # i = i+1
+ with torch.cuda.amp.autocast(enabled=args.amp):
+ if need_tgt_for_training:
+ # outputs = model(samples, targets)
+ outputs, targets, data_batch_nc = model(data_batch)
+ else:
+            outputs, targets, data_batch_nc = model(data_batch)
+
+ orig_target_sizes = torch.stack([t["size"] for t in targets], dim=0)
+        result = postprocessors['bbox'](outputs, orig_target_sizes, targets, data_batch_nc, dataset=dataset)
+
+ # DOING SMPLer-X Test
+ cur_eval_result = dataset.evaluate(result,cur_sample_idx)
+
+ cur_eval_result_list.append(cur_eval_result)
+ # for cur_eval_result in cur_eval_result_list:
+ # for k, v in cur_eval_result.items():
+ # if k in eval_result:
+ # eval_result[k] += v
+ # else:
+ # eval_result[k] = v
+ cur_sample_idx += len(result)
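+    # Gather the per-rank evaluation results onto CPU for rank-0 aggregation.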
+ cur_eval_result_new = collect_results_cpu(cur_eval_result_list, len(dataset))
+
+ if rank == 0:
+
+        # Merge the gathered per-sample metrics and print the final numbers.
+        for res in cur_eval_result_new:
+            for k, v in res.items():
+                if len(v) > 0 and k not in ('ann_idx', 'img_path'):
+                    if k in eval_result:
+                        eval_result[k].append(v)
+                    else:
+                        eval_result[k] = [v]
+
+        for k, v in eval_result.items():
+            eval_result[k] = np.concatenate(v)
+
+        dataset.print_eval_result(eval_result)
+ # print(len(cur_eval_result_new))
+
+
+@torch.no_grad()
+def inference(model,
+ criterion,
+ postprocessors,
+ data_loader,
+ device,
+ output_dir,
+ wo_class_error=False,
+ tmpdir=None,
+ gpu_collect=False,
+ args=None,
+ logger=None):
+ try:
+ need_tgt_for_training = args.use_dn
+    except AttributeError:
+ need_tgt_for_training = False
+ model.eval()
+ criterion.eval()
+
+ metric_logger = utils.MetricLogger(delimiter=' ')
+ if not wo_class_error:
+ metric_logger.add_meter(
+ 'class_error', utils.SmoothedValue(window_size=1,
+ fmt='{value:.2f}'))
+ header = 'Test:'
+ iou_types = tuple(k for k in ('bbox', 'keypoints'))
+ try:
+ useCats = args.useCats
+    except AttributeError:
+ useCats = True
+ if not useCats:
+ print('useCats: {} !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!'.format(
+ useCats))
+
+ _cnt = 0
+ results = []
+ dataset = data_loader.dataset
+ rank, world_size = get_dist_info()
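+    # Demo path: run the model on each batch, post-process the detections and
+    # hand the results to the dataset object, which saves the per-image output.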
+ for data_batch in metric_logger.log_every(data_loader, 10, header, logger=logger):
+ with torch.cuda.amp.autocast(enabled=args.amp):
+ if need_tgt_for_training:
+ # outputs = model(samples, targets)
+ outputs, targets, data_batch_nc = model(data_batch)
+ else:
+            outputs, targets, data_batch_nc = model(data_batch)
+
+ orig_target_sizes = torch.stack([t["size"] for t in targets], dim=0)
+ result = postprocessors['bbox'](outputs, orig_target_sizes, targets, data_batch_nc)
+ dataset.inference(result)
+
+ time.sleep(3)
+    if rank == 0 and args.to_vid:
+        # Stitch the rendered frames into a demo video.
+        if hasattr(dataset, 'result_img_dir'):
+            import shutil
+            images_to_video(dataset.result_img_dir,
+                            os.path.join(dataset.output_path, 'demo_vid.mp4'),
+                            remove_raw_file=False, fps=30)
+ # shutil.rmtree(dataset.result_img_dir)
+ # shutil.rmtree(dataset.tmp_dir)
+
+
diff --git a/main.py b/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..130f88f53dbf2532d95305a945974a78b0d60d8b
--- /dev/null
+++ b/main.py
@@ -0,0 +1,396 @@
+import argparse
+import datetime
+import json
+import random
+import time
+from pathlib import Path
+import os, sys
+from util.get_param_dicts import get_param_dict
+from util.logger import setup_logger
+import numpy as np
+import torch
+
+import util.misc as utils
+from detrsmpl.data.datasets import build_dataloader
+from mmcv.parallel import MMDistributedDataParallel
+
+from engine import evaluate, train_one_epoch, inference
+from util.config import DictAction
+from util.utils import ModelEma
+
+import shutil
+import torchvision.transforms as transforms
+from torch.utils.tensorboard import SummaryWriter
+import config.config as cfg
+from datasets.dataset import MultipleDatasets
+
+def get_args_parser():
+ parser = argparse.ArgumentParser('Set transformer detector',
+ add_help=False)
+ parser.add_argument('--config_file', '-c', type=str, required=True)
+ parser.add_argument(
+ '--options',
+ nargs='+',
+ action=DictAction,
+        help='override some settings in the used config; key-value pairs '
+        'in xxx=yyy format will be merged into the config file.')
+ # parser.add_argument('--exp_name', default='data/log/smplx_test', type=str)
+ # dataset parameters
+
+ # training parameters
+ parser.add_argument('--output_dir',
+ default='',
+ help='path where to save, empty for no saving')
+ parser.add_argument('--device',
+ default='cuda',
+ help='device to use for training / testing')
+ parser.add_argument('--seed', default=42, type=int)
+ parser.add_argument('--resume', default='', help='resume from checkpoint')
+ parser.add_argument('--pretrain_model_path',
+ help='load from other checkpoint')
+ parser.add_argument('--finetune_ignore', type=str, nargs='+')
+ parser.add_argument('--start_epoch',
+ default=0,
+ type=int,
+ metavar='N',
+ help='start epoch')
+ parser.add_argument('--eval', action='store_true')
+ parser.add_argument('--num_workers', default=0, type=int)
+ parser.add_argument('--test', action='store_true')
+ parser.add_argument('--debug', action='store_true')
+ parser.add_argument('--find_unused_params', action='store_true')
+
+ parser.add_argument('--save_log', action='store_true')
+ parser.add_argument('--to_vid', action='store_true')
+ parser.add_argument('--inference', action='store_true')
+ # distributed training parameters
+
+ parser.add_argument('--world_size', default=1, type=int,
+ help='number of distributed processes')
+ parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
+    parser.add_argument('--rank', default=0, type=int,
+                        help='rank of the current process in distributed training')
+ parser.add_argument("--local_rank", default=0, type=int, help='local rank for DistributedDataParallel')
+ parser.add_argument('--amp', action='store_true',
+ help="Train with mixed precision")
+
+ parser.add_argument('--inference_input', default=None, type=str)
+ return parser
+
+
+def build_model_main(args, cfg):
+ print(args.modelname)
+ from models.registry import MODULE_BUILD_FUNCS
+ assert args.modelname in MODULE_BUILD_FUNCS._module_dict
+ build_func = MODULE_BUILD_FUNCS.get(args.modelname)
+ model, criterion, postprocessors, postprocessors_aios = build_func(
+ args, cfg)
+ return model, criterion, postprocessors, postprocessors_aios
+
+
+def main(args):
+ utils.init_distributed_mode(args)
+ print('Loading config file from {}'.format(args.config_file))
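+    # The selected config is copied over config/aios_smplx.py so that the shared
+    # config.config module imported below picks it up.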
+ shutil.copy2(args.config_file,'config/aios_smplx.py')
+ from config.config import cfg
+ if args.options is not None:
+ cfg.merge_from_dict(args.options)
+ if args.rank == 0:
+ save_cfg_path = os.path.join(args.output_dir, 'config_cfg.py')
+ cfg.dump(save_cfg_path)
+ save_json_path = os.path.join(args.output_dir, 'config_args_raw.json')
+ with open(save_json_path, 'w') as f:
+ json.dump(vars(args), f, indent=2)
+ cfg_dict = cfg._cfg_dict.to_dict()
+ args_vars = vars(args)
+    for k, v in cfg_dict.items():
+        if k not in args_vars:
+            setattr(args, k, v)
+        else:
+            # Command-line args take precedence over duplicated config keys.
+            continue
+
+ # update some new args temporally
+ if not getattr(args, 'use_ema', None):
+ args.use_ema = False
+ if not getattr(args, 'debug', None):
+ args.debug = False
+
+ # setup logger
+ os.makedirs(args.output_dir, exist_ok=True)
+ logger = setup_logger(output=os.path.join(args.output_dir, 'info.txt'),
+ distributed_rank=args.rank,
+ color=False,
+ name='detr')
+ logger.info('git:\n {}\n'.format(utils.get_sha()))
+ logger.info('Command: ' + ' '.join(sys.argv))
+ writer = None
+ if args.rank == 0:
+ writer = SummaryWriter(args.output_dir)
+ save_json_path = os.path.join(args.output_dir, 'config_args_all.json')
+ # print("args:", vars(args))
+ with open(save_json_path, 'w') as f:
+ json.dump(vars(args), f, indent=2)
+ logger.info('Full config saved to {}'.format(save_json_path))
+ logger.info('world size: {}'.format(args.world_size))
+ logger.info('rank: {}'.format(args.rank))
+ logger.info('local_rank: {}'.format(args.local_rank))
+ logger.info('args: ' + str(args) + '\n')
+
+ if args.frozen_weights is not None:
+ assert args.masks, 'Frozen training is meant for segmentation only'
+
+ device = torch.device(args.device)
+
+ # fix the seed for reproducibility
+ seed = args.seed + utils.get_rank()
+ torch.manual_seed(seed)
+ np.random.seed(seed)
+ random.seed(seed)
+
+ # build model
+ model, criterion, postprocessors, _ = build_model_main(
+ args, cfg)
+
+ wo_class_error = False
+ model.to(device)
+
+ # ema
+ if args.use_ema:
+ ema_m = ModelEma(model, args.ema_decay)
+ else:
+ ema_m = None
+
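+    # Wrap the model with MMDistributedDataParallel for multi-GPU training and
+    # keep a handle to the unwrapped model for checkpointing and param groups.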
+ model_without_ddp = model
+ if args.distributed:
+ model = MMDistributedDataParallel(
+ model,
+ device_ids=[args.gpu],
+ find_unused_parameters=args.find_unused_params)
+ model_without_ddp = model.module
+ n_parameters = sum(p.numel() for p in model.parameters()
+ if p.requires_grad)
+ logger.info('number of params:' + str(n_parameters))
+ logger.info('params:\n' + json.dumps(
+ {n: p.numel()
+ for n, p in model.named_parameters() if p.requires_grad},
+ indent=2))
+
+ param_dicts = get_param_dict(args, model_without_ddp)
+ optimizer = torch.optim.AdamW(param_dicts,
+ lr=args.lr,
+ weight_decay=args.weight_decay)
+
+ logger.info('Creating dataset...')
+ if not args.eval:
+        trainset = []
+        # Import each training dataset class listed in the config by name and
+        # instantiate it.
+        for trainset_i, v in cfg.trainset_partition.items():
+            exec('from datasets.' + trainset_i + ' import ' + trainset_i)
+            trainset.append(
+                eval(trainset_i)(transforms.ToTensor(), 'train'))
+        trainset_loader = MultipleDatasets(
+            trainset, make_same_len=False, partition=cfg.trainset_partition)
+
+ data_loader_train = build_dataloader(
+ trainset_loader,
+ args.batch_size,
+ 0 if 'workers_per_gpu' in args else 1,
+ dist=args.distributed)
+ exec('from datasets.' + cfg.testset +
+ ' import ' + cfg.testset)
+
+
+ if not args.inference:
+ dataset_val = eval(cfg.testset)(transforms.ToTensor(), "test")
+ else:
+ dataset_val = eval(cfg.testset)(args.inference_input, args.output_dir)
+
+ data_loader_val = build_dataloader(
+ dataset_val,
+ args.batch_size,
+ 0 if 'workers_per_gpu' in args else 2,
+ dist=args.distributed,
+ shuffle=False)
+
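+    # Learning-rate schedule: one-cycle, multi-step, or plain step decay.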
+ if args.onecyclelr:
+ lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(
+ optimizer,
+ max_lr=args.lr,
+ steps_per_epoch=len(data_loader_train),
+ epochs=args.epochs,
+ pct_start=0.2)
+ elif args.multi_step_lr:
+ lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
+ optimizer, milestones=args.lr_drop_list)
+ else:
+ lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)
+
+ if args.frozen_weights is not None:
+ checkpoint = torch.load(args.frozen_weights, map_location='cpu')
+ model_without_ddp.detr.load_state_dict(checkpoint['model'])
+
+ output_dir = Path(args.output_dir)
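+    # Auto-resume from checkpoint.pth in the output directory when it exists.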
+ if os.path.exists(os.path.join(args.output_dir, 'checkpoint.pth')):
+ args.resume = os.path.join(args.output_dir, 'checkpoint.pth')
+ if args.resume:
+ if args.resume.startswith('https'):
+ checkpoint = torch.hub.load_state_dict_from_url(args.resume,
+ map_location='cpu',
+ check_hash=True)
+ else:
+ checkpoint = torch.load(args.resume, map_location='cpu')
+ model_without_ddp.load_state_dict(checkpoint['model'])
+ if args.use_ema:
+ if 'ema_model' in checkpoint:
+ ema_m.module.load_state_dict(
+ utils.clean_state_dict(checkpoint['ema_model']))
+ else:
+ del ema_m
+ ema_m = ModelEma(model, args.ema_decay)
+
+ if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
+ optimizer.load_state_dict(checkpoint['optimizer'])
+ lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
+ args.start_epoch = checkpoint['epoch'] + 1
+
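+    # Fine-tuning path: load pretrained weights, dropping any parameters whose
+    # names match a keyword in --finetune_ignore.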
+ if (not args.resume) and args.pretrain_model_path:
+ checkpoint = torch.load(args.pretrain_model_path,
+ map_location='cpu')['model']
+ from collections import OrderedDict
+ _ignorekeywordlist = args.finetune_ignore if args.finetune_ignore else []
+ ignorelist = []
+
+ def check_keep(keyname, ignorekeywordlist):
+ for keyword in ignorekeywordlist:
+ if keyword in keyname:
+ ignorelist.append(keyname)
+ return False
+ return True
+
+
+ _tmp_st = OrderedDict({
+ k: v
+ for k, v in utils.clean_state_dict(checkpoint).items()
+ if check_keep(k, _ignorekeywordlist)
+ })
+ logger.info('Ignore keys: {}'.format(json.dumps(ignorelist, indent=2)))
+ # Change This
+ _load_output = model_without_ddp.load_state_dict(_tmp_st, strict=False)
+ print('loading')
+ logger.info(str(_load_output))
+
+ if args.use_ema:
+ if 'ema_model' in checkpoint:
+ ema_m.module.load_state_dict(utils.clean_state_dict(checkpoint['ema_model']))
+ else:
+ del ema_m
+ ema_m = ModelEma(model, args.ema_decay)
+ _load_output = model_without_ddp.load_state_dict(_tmp_st, strict=False)
+ logger.info(str(_load_output))
+
+
+ if args.eval:
+ os.environ['EVAL_FLAG'] = 'TRUE'
+
+ if args.inference_input is not None and args.inference:
+ inference(model,
+ criterion,
+ postprocessors,
+ data_loader_val,
+ device,
+ args.output_dir,
+ wo_class_error=wo_class_error,
+ args=args)
+    else:
+        from config.config import cfg
+        cfg.result_dir = args.output_dir
+        cfg.exp_name = args.pretrain_model_path
+ evaluate(model,
+ criterion,
+ postprocessors,
+ data_loader_val,
+ device,
+ args.output_dir,
+ wo_class_error=wo_class_error,
+ args=args)
+
+ return
+
+ print('Start training')
+ start_time = time.time()
+ for epoch in range(args.start_epoch, args.epochs):
+ epoch_start_time = time.time()
+
+ train_stats = train_one_epoch(
+ model,
+ criterion,
+ data_loader_train,
+ optimizer,
+ device,
+ epoch,
+ args.clip_max_norm,
+ wo_class_error=wo_class_error,
+ lr_scheduler=lr_scheduler,
+ args=args,
+ logger=(logger if args.save_log else None),
+ ema_m=ema_m,
+ tf_writer=writer)
+
+ if not args.onecyclelr:
+ lr_scheduler.step()
+ if args.output_dir:
+ checkpoint_paths = [output_dir / 'checkpoint.pth']
+ # extra checkpoint before LR drop and every 100 epochs
+ if (epoch + 1) % args.lr_drop == 0 or (
+ epoch + 1) % args.save_checkpoint_interval == 0:
+ checkpoint_paths.append(output_dir /
+ f'checkpoint{epoch:04}.pth')
+ for checkpoint_path in checkpoint_paths:
+ weights = {
+ 'model': model_without_ddp.state_dict(),
+ 'optimizer': optimizer.state_dict(),
+ 'lr_scheduler': lr_scheduler.state_dict(),
+ 'epoch': epoch,
+ 'args': args,
+ }
+ if args.use_ema:
+ weights.update({
+ 'ema_model': ema_m.module.state_dict(),
+ })
+ utils.save_on_master(weights, checkpoint_path)
+ log_stats = {
+ **{f'train_{k}': v
+ for k, v in train_stats.items()},
+ }
+
+ ep_paras = {'epoch': epoch, 'n_parameters': n_parameters}
+ log_stats.update(ep_paras)
+        log_stats.update({'now_time': str(datetime.datetime.now())})
+
+ epoch_time = time.time() - epoch_start_time
+ epoch_time_str = str(datetime.timedelta(seconds=int(epoch_time)))
+ log_stats['epoch_time'] = epoch_time_str
+
+ if args.output_dir and utils.is_main_process():
+ with (output_dir / 'log.txt').open('a') as f:
+ f.write(json.dumps(log_stats) + '\n')
+
+ total_time = time.time() - start_time
+ total_time_str = str(datetime.timedelta(seconds=int(total_time)))
+ print('Training time {}'.format(total_time_str))
+
+
+if __name__ == '__main__':
+ parser = argparse.ArgumentParser('DETR training and evaluation script',
+ parents=[get_args_parser()])
+ __spec__ = "ModuleSpec(name='builtins', loader=)"
+ args = parser.parse_args()
+ if args.output_dir:
+ Path(args.output_dir).mkdir(parents=True, exist_ok=True)
+ main(args)
diff --git a/mmcv/.circleci/config.yml b/mmcv/.circleci/config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8fbf916c0254efc459ca7abc5d078ae2bda1f43b
--- /dev/null
+++ b/mmcv/.circleci/config.yml
@@ -0,0 +1,173 @@
+version: 2.1
+jobs:
+ lint:
+ docker:
+ - image: cimg/python:3.7.4
+ steps:
+ - checkout
+ - run:
+ name: Install pre-commit hook
+ command: |
+ pip install pre-commit
+ pre-commit install
+ - run:
+ name: Linting
+ command: pre-commit run --all-files
+
+ build_cpu:
+ parameters:
+ # The python version must match available image tags in
+ # https://circleci.com/developer/images/image/cimg/python
+ python:
+ type: string
+ default: "3.7.0"
+ torch:
+ type: string
+ torchvision:
+ type: string
+ machine:
+ image: ubuntu-2004:202010-01
+ resource_class: large
+ steps:
+ - checkout
+ - run:
+ name: Install system dependencies
+ command: |
+ sudo apt-get update
+ sudo apt-get install -y ffmpeg libturbojpeg ninja-build
+ ffmpeg -version
+ - run:
+ # https://github.com/pytorch/vision/issues/2921
+ name: Install dependency of torchvision when using pyenv
+ command: sudo apt-get install -y liblzma-dev
+ - run:
+ # python3.7 should be re-installed due to the issue https://github.com/pytorch/vision/issues/2921
+ name: Select Python
+ command: |
+ pyenv uninstall -f << parameters.python >>
+ pyenv install << parameters.python >>
+ pyenv global << parameters.python >>
+ - run:
+ name: Upgrade pip
+ command: |
+ python -m pip install pip --upgrade
+ - run:
+ name: Install PyTorch
+ command: python -m pip install torch==<< parameters.torch >>+cpu torchvision==<< parameters.torchvision >>+cpu -f https://download.pytorch.org/whl/torch_stable.html
+ - run:
+ name: Install psutil
+ command: python -m pip install psutil
+ - run:
+ name: Build and install
+ command: |
+ rm -rf .eggs
+ python setup.py check -m -s
+ python -m pip install -e .
+ no_output_timeout: 20m
+ environment:
+ MMCV_WITH_OPS: 1
+ - run:
+ name: Install dependencies of unit test
+ command: |
+ python -m pip install -r requirements/test.txt
+ - run:
+ name: Run unittests and generate coverage report
+ command: |
+ python -m coverage run --branch --source mmcv -m pytest tests/
+ python -m coverage xml
+ python -m coverage report -m
+
+ build_cu102:
+ machine:
+ image: ubuntu-1604-cuda-10.1:201909-23 # the actual version of cuda is 10.2
+ resource_class: gpu.nvidia.small
+ steps:
+ - checkout
+ - run:
+ name: Set CUDA environment
+ command: |
+ echo 'export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH' >> $BASH_ENV
+ echo 'export PATH=/usr/local/cuda/bin:$PATH' >> $BASH_ENV
+ echo 'export CUDA_HOME=/usr/local/cuda' >> $BASH_ENV
+ source $BASH_ENV
+ nvidia-smi
+ nvcc --version
+ gcc --version
+ - run:
+ name: Install system dependencies
+ command: |
+ sudo apt-get update
+ sudo apt-get install -y libturbojpeg ninja-build
+ # the default version of ffmpeg is 2.8.7, which should be upgraded to 4+
+ sudo add-apt-repository -y ppa:jonathonf/ffmpeg-4
+ sudo apt-get update
+ sudo apt-get install -y ffmpeg
+ ffmpeg -version
+ sudo add-apt-repository --remove ppa:jonathonf/ffmpeg-4 -y
+ - run:
+ # https://github.com/pytorch/vision/issues/2921
+ name: Install dependency of torchvision when using pyenv
+ command: sudo apt-get install -y liblzma-dev
+ - run:
+ # python3.7 should be re-installed due to the issue https://github.com/pytorch/vision/issues/2921
+ name: Select python3.7
+ command: |
+ pyenv uninstall -f 3.7.0
+ pyenv install 3.7.0
+ pyenv global 3.7.0
+ - run:
+ name: Upgrade pip
+ command: |
+ python -m pip install pip --upgrade
+ - run:
+ name: Install PyTorch
+ command: python -m pip install torch==1.8.1+cu102 torchvision==0.9.1+cu102 -f https://download.pytorch.org/whl/torch_stable.html
+ - run:
+ name: Install psutil
+ command: python -m pip install psutil
+ - run:
+ name: Download onnxruntime library and install onnxruntime
+ command: |
+ wget https://github.com/microsoft/onnxruntime/releases/download/v1.8.1/onnxruntime-linux-x64-1.8.1.tgz
+ tar -zxvf onnxruntime-linux-x64-1.8.1.tgz
+ echo 'export ONNXRUNTIME_DIR=$(pwd)/onnxruntime-linux-x64-1.8.1' >> $BASH_ENV
+ echo 'export LD_LIBRARY_PATH=$ONNXRUNTIME_DIR/lib:$LD_LIBRARY_PATH' >> $BASH_ENV
+ source $BASH_ENV
+ python -m pip install onnxruntime==1.8.1
+ - run:
+ name: Build and install
+ command: |
+ rm -rf .eggs
+ python setup.py check -m -s
+ python -m pip install -e .
+ environment:
+ MMCV_WITH_OPS: 1
+ MMCV_WITH_ORT: 1
+ - run:
+ name: Install dependencies for unit test
+ command: |
+ python -m pip install -r requirements/test.txt
+ - run:
+ name: Run unittests and generate coverage report
+ command: |
+ python -m coverage run --branch --source mmcv -m pytest tests/
+ python -m coverage xml
+ python -m coverage report -m
+workflows:
+ unit_tests:
+ jobs:
+ - lint
+ - build_cpu:
+ name: build_py3.8_pt1.9_cpu
+ torch: 1.9.0
+ torchvision: 0.10.0
+ python: "3.8.0"
+ requires:
+ - lint
+ - hold:
+ type: approval # <<< This key-value pair will set your workflow to a status of "On Hold"
+ requires:
+ - build_py3.8_pt1.9_cpu
+ - build_cu102:
+ requires:
+ - hold
diff --git a/mmcv/.dev_scripts/check_installation.py b/mmcv/.dev_scripts/check_installation.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b771acc5b227bc584ad6dc46c5a85d16a16d6a2
--- /dev/null
+++ b/mmcv/.dev_scripts/check_installation.py
@@ -0,0 +1,44 @@
+import numpy as np
+import torch
+
+from mmcv.ops import box_iou_rotated
+from mmcv.utils import collect_env
+
+
+def check_installation():
+ """Check whether mmcv-full has been installed successfully."""
+ np_boxes1 = np.asarray(
+ [[1.0, 1.0, 3.0, 4.0, 0.5], [2.0, 2.0, 3.0, 4.0, 0.6],
+ [7.0, 7.0, 8.0, 8.0, 0.4]],
+ dtype=np.float32)
+ np_boxes2 = np.asarray(
+ [[0.0, 2.0, 2.0, 5.0, 0.3], [2.0, 1.0, 3.0, 3.0, 0.5],
+ [5.0, 5.0, 6.0, 7.0, 0.4]],
+ dtype=np.float32)
+ boxes1 = torch.from_numpy(np_boxes1)
+ boxes2 = torch.from_numpy(np_boxes2)
+
+ # test mmcv-full with CPU ops
+ box_iou_rotated(boxes1, boxes2)
+ print('CPU ops were compiled successfully.')
+
+ # test mmcv-full with both CPU and CUDA ops
+ if torch.cuda.is_available():
+ boxes1 = boxes1.cuda()
+ boxes2 = boxes2.cuda()
+ box_iou_rotated(boxes1, boxes2)
+ print('CUDA ops were compiled successfully.')
+ else:
+ print('No CUDA runtime is found, skipping the checking of CUDA ops.')
+
+
+if __name__ == '__main__':
+ print('Start checking the installation of mmcv-full ...')
+ check_installation()
+ print('mmcv-full has been installed successfully.\n')
+
+ env_info_dict = collect_env()
+ env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
+ dash_line = '-' * 60 + '\n'
+ print('Environment information:')
+ print(dash_line + env_info + '\n' + dash_line)
diff --git a/mmcv/.dev_scripts/visualize_lr.py b/mmcv/.dev_scripts/visualize_lr.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ca9aaa116e75b8f693589d1bcc0031d5ace0277
--- /dev/null
+++ b/mmcv/.dev_scripts/visualize_lr.py
@@ -0,0 +1,230 @@
+import argparse
+import json
+import os
+import os.path as osp
+import time
+import warnings
+from collections import OrderedDict
+from unittest.mock import patch
+
+import matplotlib.pyplot as plt
+import numpy as np
+import torch.nn as nn
+from torch.optim import SGD
+from torch.utils.data import DataLoader
+
+import mmcv
+from mmcv.runner import build_runner
+from mmcv.utils import get_logger
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Visualize the given config '
+                                     'of learning rate and momentum; this '
+                                     'script will overwrite the log_config')
+ parser.add_argument('config', help='train config file path')
+ parser.add_argument(
+ '--work-dir', default='./', help='the dir to save logs and models')
+    parser.add_argument(
+        '--num-iters', default=300, type=int,
+        help='The number of iters per epoch')
+    parser.add_argument(
+        '--num-epochs', default=300, type=int,
+        help='Only used in EpochBasedRunner')
+    parser.add_argument(
+        '--window-size',
+        default='12*14',
+        help='Size of the window to display images, in format of "$W*$H".')
+    parser.add_argument(
+        '--log-interval', default=10, type=int,
+        help='The interval of TextLoggerHook')
+ args = parser.parse_args()
+ return args
+
+
+class SimpleModel(nn.Module):
+
+ def __init__(self):
+ super().__init__()
+ self.conv = nn.Conv2d(1, 1, 1)
+
+ def train_step(self, *args, **kwargs):
+ return dict()
+
+ def val_step(self, *args, **kwargs):
+ return dict()
+
+
+def iter_train(self, data_loader, **kwargs):
+ self.mode = 'train'
+ self.data_loader = data_loader
+ self.call_hook('before_train_iter')
+ self.call_hook('after_train_iter')
+ self._inner_iter += 1
+ self._iter += 1
+
+
+def epoch_train(self, data_loader, **kwargs):
+ self.model.train()
+ self.mode = 'train'
+ self.data_loader = data_loader
+ self._max_iters = self._max_epochs * len(self.data_loader)
+ self.call_hook('before_train_epoch')
+ for i, data_batch in enumerate(self.data_loader):
+ self._inner_iter = i
+ self.call_hook('before_train_iter')
+ self.call_hook('after_train_iter')
+ self._iter += 1
+ self.call_hook('after_train_epoch')
+ self._epoch += 1
+
+
+def log(self, runner):
+ cur_iter = self.get_iter(runner, inner_iter=True)
+
+ log_dict = OrderedDict(
+ mode=self.get_mode(runner),
+ epoch=self.get_epoch(runner),
+ iter=cur_iter)
+
+ # only record lr of the first param group
+ cur_lr = runner.current_lr()
+ if isinstance(cur_lr, list):
+ log_dict['lr'] = cur_lr[0]
+ else:
+ assert isinstance(cur_lr, dict)
+ log_dict['lr'] = {}
+ for k, lr_ in cur_lr.items():
+ assert isinstance(lr_, list)
+ log_dict['lr'].update({k: lr_[0]})
+
+ cur_momentum = runner.current_momentum()
+ if isinstance(cur_momentum, list):
+ log_dict['momentum'] = cur_momentum[0]
+ else:
+ assert isinstance(cur_momentum, dict)
+ log_dict['momentum'] = {}
+ for k, lr_ in cur_momentum.items():
+ assert isinstance(lr_, list)
+ log_dict['momentum'].update({k: lr_[0]})
+ log_dict = dict(log_dict, **runner.log_buffer.output)
+ self._log_info(log_dict, runner)
+ self._dump_log(log_dict, runner)
+ return log_dict
+
+
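+# The runner's train loops and TextLoggerHook.log are patched with lightweight
+# stand-ins so the LR/momentum schedules can be traced without real training.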
+@patch('torch.cuda.is_available', lambda: False)
+@patch('mmcv.runner.EpochBasedRunner.train', epoch_train)
+@patch('mmcv.runner.IterBasedRunner.train', iter_train)
+@patch('mmcv.runner.hooks.TextLoggerHook.log', log)
+def run(cfg, logger):
+ momentum_config = cfg.get('momentum_config')
+ lr_config = cfg.get('lr_config')
+
+ model = SimpleModel()
+ optimizer = SGD(model.parameters(), 0.1, momentum=0.8)
+ cfg.work_dir = cfg.get('work_dir', './')
+ workflow = [('train', 1)]
+
+ if cfg.get('runner') is None:
+ cfg.runner = {
+ 'type': 'EpochBasedRunner',
+ 'max_epochs': cfg.get('total_epochs', cfg.num_epochs)
+ }
+ warnings.warn(
+ 'config is now expected to have a `runner` section, '
+ 'please set `runner` in your config.', UserWarning)
+ batch_size = 1
+ data = cfg.get('data')
+ if data:
+ batch_size = data.get('samples_per_gpu')
+ fake_dataloader = DataLoader(
+ list(range(cfg.num_iters)), batch_size=batch_size)
+ runner = build_runner(
+ cfg.runner,
+ default_args=dict(
+ model=model,
+ batch_processor=None,
+ optimizer=optimizer,
+ work_dir=cfg.work_dir,
+ logger=logger,
+ meta=None))
+ log_config = dict(
+ interval=cfg.log_interval, hooks=[
+ dict(type='TextLoggerHook'),
+ ])
+
+ runner.register_training_hooks(lr_config, log_config=log_config)
+ runner.register_momentum_hook(momentum_config)
+ runner.run([fake_dataloader], workflow)
+
+
+def plot_lr_curve(json_file, cfg):
+ data_dict = dict(LearningRate=[], Momentum=[])
+ assert os.path.isfile(json_file)
+ with open(json_file) as f:
+ for line in f:
+ log = json.loads(line.strip())
+ data_dict['LearningRate'].append(log['lr'])
+ data_dict['Momentum'].append(log['momentum'])
+
+ wind_w, wind_h = (int(size) for size in cfg.window_size.split('*'))
+ # if legend is None, use {filename}_{key} as legend
+ fig, axes = plt.subplots(2, 1, figsize=(wind_w, wind_h))
+ plt.subplots_adjust(hspace=0.5)
+ font_size = 20
+ for index, (updater_type, data_list) in enumerate(data_dict.items()):
+ ax = axes[index]
+ if cfg.runner.type == 'EpochBasedRunner':
+ ax.plot(data_list, linewidth=1)
+ ax.xaxis.tick_top()
+ ax.set_xlabel('Iters', fontsize=font_size)
+ ax.xaxis.set_label_position('top')
+ sec_ax = ax.secondary_xaxis(
+ 'bottom',
+ functions=(lambda x: x / cfg.num_iters * cfg.log_interval,
+ lambda y: y * cfg.num_iters / cfg.log_interval))
+ sec_ax.tick_params(labelsize=font_size)
+ sec_ax.set_xlabel('Epochs', fontsize=font_size)
+ else:
+ # plt.subplot(2, 1, index + 1)
+ x_list = np.arange(len(data_list)) * cfg.log_interval
+ ax.plot(x_list, data_list)
+ ax.set_xlabel('Iters', fontsize=font_size)
+ ax.set_ylabel(updater_type, fontsize=font_size)
+ if updater_type == 'LearningRate':
+ if cfg.get('lr_config'):
+ title = cfg.lr_config.type
+ else:
+ title = 'No learning rate scheduler'
+ else:
+ if cfg.get('momentum_config'):
+ title = cfg.momentum_config.type
+ else:
+ title = 'No momentum scheduler'
+ ax.set_title(title, fontsize=font_size)
+ ax.grid()
+ # set tick font size
+ ax.tick_params(labelsize=font_size)
+ save_path = osp.join(cfg.work_dir, 'visualization-result')
+ plt.savefig(save_path)
+ print(f'The learning rate graph is saved at {save_path}.png')
+ plt.show()
+
+
+def main():
+ args = parse_args()
+ timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
+ cfg = mmcv.Config.fromfile(args.config)
+ cfg['num_iters'] = args.num_iters
+ cfg['num_epochs'] = args.num_epochs
+ cfg['log_interval'] = args.log_interval
+ cfg['window_size'] = args.window_size
+
+ log_path = osp.join(cfg.get('work_dir', './'), f'{timestamp}.log')
+ json_path = log_path + '.json'
+ logger = get_logger('mmcv', log_path)
+
+ run(cfg, logger)
+ plot_lr_curve(json_path, cfg)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/mmcv/.dockerignore b/mmcv/.dockerignore
new file mode 100644
index 0000000000000000000000000000000000000000..8c22f226d3e2d8a625515290691d2cfc6ed87f2e
--- /dev/null
+++ b/mmcv/.dockerignore
@@ -0,0 +1,6 @@
+.git
+.gitignore
+*.egg-info
+.eggs/
+.mypy-cache
+pip-wheel-metadata
diff --git a/mmcv/.github/ISSUE_TEMPLATE/config.yml b/mmcv/.github/ISSUE_TEMPLATE/config.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9ca189206785218a14096a6f9563b1f976ffb12f
--- /dev/null
+++ b/mmcv/.github/ISSUE_TEMPLATE/config.yml
@@ -0,0 +1,9 @@
+blank_issues_enabled: false
+
+contact_links:
+ - name: Common Issues
+ url: https://mmcv.readthedocs.io/en/latest/trouble_shooting.html
+ about: Check if your issue already has solutions
+ - name: MMCV Documentation
+ url: https://mmcv.readthedocs.io/en/latest/
+ about: Check if your question is answered in docs
diff --git a/mmcv/.github/ISSUE_TEMPLATE/feature_request.md b/mmcv/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000000000000000000000000000000000000..7bf92e8c912df6839eb755715c181f5fc7244f36
--- /dev/null
+++ b/mmcv/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,21 @@
+---
+name: Feature request
+about: Suggest an idea for this project
+title: ''
+labels: ''
+assignees: ''
+---
+
+**Describe the feature**
+
+**Motivation**
+A clear and concise description of the motivation of the feature.
+Ex1. It is inconvenient when \[....\].
+Ex2. There is a recent paper \[....\], which is very helpful for \[....\].
+
+**Related resources**
+If there is an official code release or a third-party implementation, please also provide the information here; it would be very helpful.
+
+**Additional context**
+Add any other context or screenshots about the feature request here.
+If you would like to implement the feature and create a PR, please leave a comment here and that would be much appreciated.
diff --git a/mmcv/.github/ISSUE_TEMPLATE/general_questions.md b/mmcv/.github/ISSUE_TEMPLATE/general_questions.md
new file mode 100644
index 0000000000000000000000000000000000000000..b5eaf2610781037b0cbea9a146c034ebb36f2934
--- /dev/null
+++ b/mmcv/.github/ISSUE_TEMPLATE/general_questions.md
@@ -0,0 +1,12 @@
+---
+name: General questions
+about: Ask general questions to get help
+title: ''
+labels: ''
+assignees: ''
+---
+
+**Checklist**
+
+1. I have searched related issues but cannot get the expected help.
+2. I have read the FAQ documentation but cannot get the expected help.
diff --git a/mmcv/.github/ISSUE_TEMPLATE/unexpected_report.md b/mmcv/.github/ISSUE_TEMPLATE/unexpected_report.md
new file mode 100644
index 0000000000000000000000000000000000000000..b0ccc0fd18bc1548d517e05afe9ae183d32bb0f9
--- /dev/null
+++ b/mmcv/.github/ISSUE_TEMPLATE/unexpected_report.md
@@ -0,0 +1,45 @@
+---
+name: Unexpected Results
+about: Create a report to help us improve
+title: ''
+labels: ''
+assignees: ''
+---
+
+Thanks for reporting the unexpected results; we appreciate it a lot.
+
+**Checklist**
+
+1. I have searched related issues but cannot get the expected help.
+2. I have read the [FAQ documentation](https://mmcv.readthedocs.io/en/latest/trouble_shooting.html) but cannot get the expected help.
+3. The unexpected results still exist in the latest version.
+
+**Describe the Issue**
+A clear and concise description of what the bug is, including what results you expected and what results you actually got.
+
+**Reproduction**
+
+1. What command, code, or script did you run?
+
+```bash
+A placeholder for the command.
+```
+
+2. Did you make any modifications on the code? Did you understand what you have modified?
+
+**Environment**
+
+1. Please run `python -c "from mmcv.utils import collect_env; print(collect_env())"` to collect necessary environment information and paste it here.
+2. You may add additional information that may be helpful for locating the problem, such as
+ - How you installed PyTorch \[e.g., pip, conda, source\]
+ - Other environment variables that may be related (such as `$PATH`, `$LD_LIBRARY_PATH`, `$PYTHONPATH`, etc.)
+
+**Error traceback**
+If applicable, paste the error traceback here.
+
+```none
+A placeholder for traceback.
+```
+
+**Bug fix**
+If you have already identified the reason, you can provide the information here. If you are willing to create a PR to fix it, please also leave a comment here and that would be much appreciated!
diff --git a/mmcv/.github/pull_request_template.md b/mmcv/.github/pull_request_template.md
new file mode 100644
index 0000000000000000000000000000000000000000..0980b85db1c5fc90b2a8c32aa5fbdf923b25bf32
--- /dev/null
+++ b/mmcv/.github/pull_request_template.md
@@ -0,0 +1,33 @@
+Thanks for your contribution; we appreciate it a lot. The following instructions will help make your pull request healthier and get feedback more easily. If you do not understand some items, don't worry: just open the pull request and seek help from the maintainers.
+
+## Motivation
+
+Please describe the motivation of this PR and the goal you want to achieve through this PR.
+
+## Modification
+
+Please briefly describe what modification is made in this PR.
+
+## BC-breaking (Optional)
+
+Does the modification introduce changes that break the backward-compatibility of the downstream repositories?
+If so, please describe how it breaks the compatibility and how the downstream projects should modify their code to keep compatibility with this PR.
+
+## Use cases (Optional)
+
+If this PR introduces a new feature, it is better to list some use cases here, and update the documentation.
+
+## Checklist
+
+**Before PR**:
+
+- [ ] I have read and followed the workflow indicated in the [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) to create this PR.
+- [ ] Pre-commit or linting tools indicated in [CONTRIBUTING.md](https://github.com/open-mmlab/mmcv/blob/master/CONTRIBUTING.md) are used to fix the potential lint issues.
+- [ ] Bug fixes are covered by unit tests; the case that causes the bug should be added to the unit tests.
+- [ ] New functionalities are covered by complete unit tests. If not, please add more unit tests to ensure correctness.
+- [ ] The documentation has been modified accordingly, including docstring or example tutorials.
+
+**After PR**:
+
+- [ ] If the modification has potential influence on downstream or other related projects, this PR should be tested with some of those projects, like MMDet or MMCls.
+- [ ] CLA has been signed and all committers have signed the CLA in this PR.
diff --git a/mmcv/.github/workflows/build.yml b/mmcv/.github/workflows/build.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e2ec9d8796e3309227abe85319e55486dac744bd
--- /dev/null
+++ b/mmcv/.github/workflows/build.yml
@@ -0,0 +1,404 @@
+name: build
+
+on:
+ push:
+ paths-ignore:
+ - 'README.md'
+ - 'README_zh-CN.md'
+ - 'docs/**'
+ - 'examples/**'
+ - '.dev_scripts/**'
+ - 'docker/**'
+
+ pull_request:
+ paths-ignore:
+ - 'README.md'
+ - 'README_zh-CN.md'
+ - 'docs/**'
+ - 'examples/**'
+ - '.dev_scripts/**'
+ - 'docker/**'
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+env:
+ MMCV_WITH_OPS: 1
+
+jobs:
+ build_without_torch:
+ runs-on: ubuntu-18.04
+ strategy:
+ matrix:
+ python-version: [3.7]
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install system dependencies
+ run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg
+ - name: Build and install
+ run: rm -rf .eggs && pip install -e .
+ - name: Validate the installation
+ run: python -c "import mmcv"
+ - name: Run unittests and generate coverage report
+ run: |
+ pip install -r requirements/test.txt
+ pytest tests/ \
+ --ignore=tests/test_runner \
+ --ignore=tests/test_device/test_ipu \
+ --ignore=tests/test_optimizer.py \
+ --ignore=tests/test_cnn \
+ --ignore=tests/test_parallel.py \
+ --ignore=tests/test_ops \
+ --ignore=tests/test_load_model_zoo.py \
+ --ignore=tests/test_utils/test_logging.py \
+ --ignore=tests/test_image/test_io.py \
+ --ignore=tests/test_utils/test_registry.py \
+ --ignore=tests/test_utils/test_parrots_jit.py \
+ --ignore=tests/test_utils/test_trace.py \
+ --ignore=tests/test_utils/test_hub.py \
+ --ignore=tests/test_device \
+ --ignore=tests/test_utils/test_torch_ops.py
+
+ build_without_ops:
+ runs-on: ubuntu-18.04
+ env:
+ MMCV_WITH_OPS: 0
+ strategy:
+ matrix:
+ python-version: [3.7]
+ torch: [1.7.0, 1.8.0, 1.9.0]
+ include:
+ - torch: 1.7.0
+ torchvision: 0.8.1
+ - torch: 1.8.0
+ torchvision: 0.9.0
+ - torch: 1.9.0
+ torchvision: 0.10.0
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install system dependencies
+ run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg
+ - name: Install PyTorch
+ run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
+ - name: Build and install
+ run: rm -rf .eggs && pip install -e .
+ - name: Validate the installation
+ run: python -c "import mmcv"
+ - name: Run unittests
+ run: |
+ pip install -r requirements/test.txt
+ pytest tests/ --ignore=tests/test_ops
+
+ build_cpu:
+ runs-on: ubuntu-18.04
+ strategy:
+ matrix:
+ python-version: [3.7]
+ torch: [1.5.1, 1.6.0, 1.7.0, 1.8.0, 1.9.0]
+ include:
+ - torch: 1.5.1
+ torchvision: 0.6.1
+ - torch: 1.6.0
+ torchvision: 0.7.0
+ - torch: 1.7.0
+ torchvision: 0.8.1
+ - torch: 1.8.0
+ torchvision: 0.9.0
+ - torch: 1.9.0
+ torchvision: 0.10.0
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install system dependencies
+ run: sudo apt-get update && sudo apt-get install -y ffmpeg libturbojpeg
+ - name: Install PyTorch
+ run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html
+      # psutil is an optional package used to detect the number of CPUs for compiling mmcv
+ - name: Install psutil
+ run: pip install psutil
+ - name: Create sdist and untar
+ run: |
+ MMCV_WITH_OPS=1 python setup.py sdist
+ tar zxvf dist/mmcv-full* -C /tmp
+ rm -r mmcv
+ - name: Build and install from sdist
+ run: |
+ pushd /tmp/mmcv-full*
+ pip install -e .
+ popd
+ - name: Validate the installation
+ run: python -c "import mmcv"
+ - name: Run unittests and generate coverage report
+ run: |
+ pip install -r requirements/test.txt
+ coverage run --branch --source=mmcv -m pytest tests/
+ coverage xml
+ coverage report -m
+
+ build_cu101:
+ runs-on: ubuntu-18.04
+ container:
+ image: pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel
+ env:
+ FORCE_CUDA: 1
+ MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61
+ strategy:
+ matrix:
+ python-version: [3.7]
+ torch: [1.3.1, 1.5.1+cu101, 1.6.0+cu101, 1.7.0+cu101, 1.8.0+cu101]
+ include:
+ - torch: 1.3.1
+ torchvision: 0.4.2
+ - torch: 1.5.1+cu101
+ torchvision: 0.6.1+cu101
+ - torch: 1.6.0+cu101
+ torchvision: 0.7.0+cu101
+ - torch: 1.7.0+cu101
+ torchvision: 0.8.1+cu101
+ - torch: 1.8.0+cu101
+ torchvision: 0.9.0+cu101
+ - python-version: 3.6
+ torch: 1.8.0+cu101
+ torchvision: 0.9.0+cu101
+ - python-version: 3.8
+ torch: 1.8.0+cu101
+ torchvision: 0.9.0+cu101
+ - python-version: 3.9
+ torch: 1.8.0+cu101
+ torchvision: 0.9.0+cu101
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Fetch GPG keys
+ run: |
+ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+ - name: Install python-dev
+ run: apt-get update && apt-get install -y python${{matrix.python-version}}-dev
+ if: ${{matrix.python-version != '3.9'}}
+ - name: Install Pillow
+ run: python -m pip install Pillow==6.2.2
+ if: ${{matrix.torchvision == '0.4.2'}}
+ # When we use a third-party container, we need to add python -m to call
+ # the user-installed pip when we use the pip command, otherwise it will
+ # call the system pip
+ - name: Install PyTorch
+ run: python -m pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} -f https://download.pytorch.org/whl/torch_stable.html
+ - name: Install system dependencies
+ run: apt-get update && apt-get install -y ffmpeg libturbojpeg ninja-build
+ - name: Install dependencies for compiling onnx when python=3.9
+ run: python -m pip install protobuf && apt-get -y install libprotobuf-dev protobuf-compiler cmake
+ if: ${{matrix.python-version == '3.9'}}
+      # psutil is an optional package used to detect the number of CPUs for compiling mmcv
+ - name: Install psutil
+ run: python -m pip install psutil
+ - name: Build and install
+ run: rm -rf .eggs && python -m pip install -e .
+ - name: Validate the installation
+ run: python -c "import mmcv"
+ - name: Run unittests and generate coverage report
+ run: |
+ python -m pip install -r requirements/test.txt
+ coverage run --branch --source=mmcv -m pytest tests/
+ coverage xml
+ coverage report -m
+ # Only upload coverage report for python3.7 && pytorch1.6
+ - name: Upload coverage to Codecov
+ if: ${{matrix.torch == '1.6.0+cu101' && matrix.python-version == '3.7'}}
+ uses: codecov/codecov-action@v1.0.14
+ with:
+ file: ./coverage.xml
+ flags: unittests
+ env_vars: OS,PYTHON
+ name: codecov-umbrella
+ fail_ci_if_error: false
+
+ build_cu102:
+ runs-on: ubuntu-18.04
+ container:
+ image: pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel
+ env:
+ FORCE_CUDA: 1
+ MMCV_CUDA_ARGS: -gencode=arch=compute_61,code=sm_61
+ strategy:
+ matrix:
+ python-version: [3.7]
+ torch: [1.9.0+cu102, 1.10.0+cu102]
+ include:
+ - torch: 1.9.0+cu102
+ torchvision: 0.10.0+cu102
+ - torch: 1.10.0+cu102
+ torchvision: 0.11.0+cu102
+ - python-version: '3.10'
+ torch: 1.11.0+cu102
+ torchvision: 0.12.0+cu102
+ - python-version: '3.10'
+ torch: 1.12.0+cu102
+ torchvision: 0.13.0+cu102
+ - python-version: 3.6
+ torch: 1.9.0+cu102
+ torchvision: 0.10.0+cu102
+ - python-version: 3.8
+ torch: 1.9.0+cu102
+ torchvision: 0.10.0+cu102
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Fetch GPG keys
+ run: |
+ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
+ apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+ - name: Add PPA
+ run: |
+ apt-get update && apt-get install -y software-properties-common
+ add-apt-repository -y ppa:deadsnakes/ppa
+ - name: Install python-dev
+ run: apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y python${{matrix.python-version}}-dev
+      - name: Install PyTorch
+ run: python -m pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} -f https://download.pytorch.org/whl/torch_stable.html
+ - name: Install system dependencies
+ run: apt-get update && apt-get install -y ffmpeg libturbojpeg ninja-build
+      # psutil is an optional package used to detect the number of CPUs for compiling mmcv
+ - name: Install psutil
+ run: python -m pip install psutil
+ - name: Build and install
+ run: rm -rf .eggs && python -m pip install -e .
+ - name: Validate the installation
+ run: python -c "import mmcv"
+ - name: Run unittests and generate coverage report
+ run: |
+ python -m pip install -r requirements/test.txt
+ coverage run --branch --source=mmcv -m pytest tests/
+ coverage xml
+ if: ${{matrix.python-version != '3.10'}}
+ # special treatment for python3.10 because onnx and onnxruntime don't provide python3.10 pre-built packages
+ - name: Run unittests and generate coverage report for python3.10
+ run: |
+ python -m pip install -r requirements/test.txt
+ coverage run --branch --source=mmcv -m pytest tests/ --ignore=tests/test_ops/test_onnx.py --ignore=tests/test_ops/test_tensorrt.py --ignore=tests/test_ops/test_tensorrt_preprocess.py
+ coverage xml
+ if: ${{matrix.python-version == '3.10'}}
+
+
+ build_windows_without_ops:
+ runs-on: windows-latest
+ env:
+ MMCV_WITH_OPS: 0
+ strategy:
+ matrix:
+ torch: [1.7.1, 1.8.0, 1.9.0]
+ include:
+ - torch: 1.7.1
+ torchvision: 0.8.2
+ - torch: 1.8.0
+ torchvision: 0.9.0
+ - torch: 1.9.0
+ torchvision: 0.10.0
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python 3.7
+ uses: actions/setup-python@v2
+ with:
+ python-version: 3.7
+ - name: Install PyTorch
+ run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu --no-cache-dir -f https://download.pytorch.org/whl/torch_stable.html
+ - name: Build and install
+ run: pip install -e .
+ - name: Validate the installation
+ run: python -c "import mmcv"
+ - name: Run unittests
+ run: |
+ pip install -r requirements/test.txt
+ pytest tests/ --ignore=tests/test_ops --ignore tests/test_utils/test_progressbar.py --ignore tests/test_utils/test_timer.py --ignore tests/test_image/test_io.py
+
+ build_windows:
+ runs-on: windows-latest
+ strategy:
+ matrix:
+ torch: [1.7.1, 1.8.0, 1.9.0]
+ include:
+ - torch: 1.7.1
+ torchvision: 0.8.2
+ - torch: 1.8.0
+ torchvision: 0.9.0
+ - torch: 1.9.0
+ torchvision: 0.10.0
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python 3.7
+ uses: actions/setup-python@v2
+ with:
+ python-version: 3.7
+ - name: Install PyTorch
+ run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu --no-cache-dir -f https://download.pytorch.org/whl/torch_stable.html
+ - name: Build and install
+ run: pip install -e .
+ - name: Validate the installation
+ run: python -c "import mmcv"
+ - name: Run unittests
+ run: |
+ pip install -r requirements/test.txt
+ pytest tests/ --ignore tests/test_utils/test_progressbar.py --ignore tests/test_utils/test_timer.py --ignore tests/test_image/test_io.py
+
+ build_macos:
+ runs-on: macos-latest
+ strategy:
+ matrix:
+ torch: [1.3.1, 1.5.1, 1.6.0, 1.7.0, 1.8.0, 1.9.0]
+ include:
+ - torch: 1.3.1
+ torchvision: 0.4.2
+ - torch: 1.5.1
+ torchvision: 0.6.1
+ - torch: 1.6.0
+ torchvision: 0.7.0
+ - torch: 1.7.0
+ torchvision: 0.8.1
+ - torch: 1.8.0
+ torchvision: 0.9.0
+ - torch: 1.9.0
+ torchvision: 0.10.0
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python 3.7
+ uses: actions/setup-python@v2
+ with:
+ python-version: 3.7
+ - name: Install system dependencies
+ run: brew install ffmpeg jpeg-turbo
+ - name: Install utils
+ run: pip install psutil
+ - name: Install Pillow
+ run: pip install Pillow==6.2.2
+ if: ${{matrix.torchvision == '0.4.2'}}
+ - name: Install PyTorch
+ run: pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} --no-cache-dir
+ - name: Build and install
+ run: |
+ rm -rf .eggs
+ CC=clang CXX=clang++ CFLAGS='-stdlib=libc++' pip install -e .
+ - name: Validate the installation
+ run: python -c "import mmcv"
+ - name: Run unittests
+ run: |
+ pip install -r requirements/test.txt
+ # The timing on macos VMs is not precise, so we skip the progressbar tests
+ pytest tests/ --ignore tests/test_utils/test_progressbar.py --ignore tests/test_utils/test_timer.py
diff --git a/mmcv/.github/workflows/build_pat.yml b/mmcv/.github/workflows/build_pat.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9b02c3f41a546df213e5bf2c5e15e9047ed6c494
--- /dev/null
+++ b/mmcv/.github/workflows/build_pat.yml
@@ -0,0 +1,26 @@
+name: build_pat
+
+on: push
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+env:
+ MMCV_WITH_OPS: 1
+
+jobs:
+ build_parrots:
+ runs-on: ubuntu-18.04
+ container:
+ image: ghcr.io/zhouzaida/parrots-mmcv:1.3.4
+ credentials:
+ username: zhouzaida
+ password: ${{ secrets.CR_PAT }}
+
+ steps:
+ - uses: actions/checkout@v2
+ - name: Install unittest dependencies
+ run: pip install -r requirements/test.txt
+ - name: Build and install
+ run: rm -rf .eggs && MMCV_WITH_OPS=1 pip install -e .
diff --git a/mmcv/.github/workflows/lint.yml b/mmcv/.github/workflows/lint.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7f0550681d804d137182d40396d6d42973acc83b
--- /dev/null
+++ b/mmcv/.github/workflows/lint.yml
@@ -0,0 +1,29 @@
+name: lint
+
+on: [push, pull_request]
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ lint:
+ runs-on: ubuntu-18.04
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python 3.7
+ uses: actions/setup-python@v2
+ with:
+ python-version: 3.7
+ - name: Install pre-commit hook
+ run: |
+ pip install pre-commit
+ pre-commit install
+ - name: Linting
+ run: pre-commit run --all-files
+ - name: Format c/cuda codes with clang-format
+ uses: DoozyX/clang-format-lint-action@v0.11
+ with:
+ source: mmcv/ops/csrc
+ extensions: h,c,cpp,hpp,cu,cuh
+ style: google
diff --git a/mmcv/.github/workflows/publish-to-pypi.yml b/mmcv/.github/workflows/publish-to-pypi.yml
new file mode 100644
index 0000000000000000000000000000000000000000..04b0add31fd12d808f58dc45bc6e02eb2ad59623
--- /dev/null
+++ b/mmcv/.github/workflows/publish-to-pypi.yml
@@ -0,0 +1,46 @@
+name: deploy
+
+on: push
+
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
+jobs:
+ build-n-publish:
+ runs-on: ubuntu-18.04
+ if: startsWith(github.event.ref, 'refs/tags')
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python 3.7
+ uses: actions/setup-python@v1
+ with:
+ python-version: 3.7
+ - name: Upgrade Setuptools
+ run: pip install setuptools --upgrade
+ - name: Build MMCV
+ run: python setup.py sdist
+ - name: Publish distribution to PyPI
+ run: |
+ pip install twine
+ twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }}
+
+ build-n-publish_with_ops:
+ runs-on: ubuntu-18.04
+ if: startsWith(github.event.ref, 'refs/tags')
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python 3.7
+ uses: actions/setup-python@v1
+ with:
+ python-version: 3.7
+ - name: Upgrade Setuptools
+ run: pip install setuptools --upgrade
+ - name: Build MMCV with ops
+ run: |
+ sed -i "s/os.getenv('MMCV_WITH_OPS', '0')/os.getenv('MMCV_WITH_OPS', '1')/g" setup.py
+ python setup.py sdist
+ - name: Publish distribution to PyPI
+ run: |
+ pip install twine
+ twine upload dist/* -u __token__ -p ${{ secrets.pypi_password }}
diff --git a/mmcv/.gitignore b/mmcv/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..10a38f688f192d041d9aac98a2ace4bb8b1afd62
--- /dev/null
+++ b/mmcv/.gitignore
@@ -0,0 +1,121 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# PyTorch checkpoint
+*.pth
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/en/_build/
+docs/zh_cn/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# editors and IDEs
+.idea/
+.vscode/
+
+# custom
+.DS_Store
+
+# datasets and logs and checkpoints
+data/
+work_dir/
+
+src/
diff --git a/mmcv/.owners.yml b/mmcv/.owners.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8f7057cb339a36314e92be9b74d0b8ab1df2defc
--- /dev/null
+++ b/mmcv/.owners.yml
@@ -0,0 +1,14 @@
+assign:
+ strategy:
+ # random
+ daily-shift-based
+  schedule:
+ '*/1 * * * *'
+ assignees:
+ - zhouzaida
+ - ice-tong
+ - HAOCHENYE
+ - zhouzaida
+ - ice-tong
+ - HAOCHENYE
+ - zhouzaida
diff --git a/mmcv/.pre-commit-config-zh-cn.yaml b/mmcv/.pre-commit-config-zh-cn.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0b0f85cacda50d99f1de49dda04d799ce25234a6
--- /dev/null
+++ b/mmcv/.pre-commit-config-zh-cn.yaml
@@ -0,0 +1,72 @@
+exclude: ^tests/data/
+repos:
+ - repo: https://gitee.com/openmmlab/mirrors-flake8
+ rev: 3.8.3
+ hooks:
+ - id: flake8
+ - repo: https://gitee.com/openmmlab/mirrors-isort
+ rev: 5.10.1
+ hooks:
+ - id: isort
+ - repo: https://gitee.com/openmmlab/mirrors-yapf
+ rev: v0.30.0
+ hooks:
+ - id: yapf
+ - repo: https://gitee.com/openmmlab/mirrors-pre-commit-hooks
+ rev: v3.1.0
+ hooks:
+ - id: trailing-whitespace
+ - id: check-yaml
+ - id: end-of-file-fixer
+ - id: requirements-txt-fixer
+ - id: double-quote-string-fixer
+ - id: check-merge-conflict
+ - id: fix-encoding-pragma
+ args: ["--remove"]
+ - id: mixed-line-ending
+ args: ["--fix=lf"]
+ - repo: https://gitee.com/openmmlab/mirrors-codespell
+ rev: v2.1.0
+ hooks:
+ - id: codespell
+ - repo: https://gitee.com/openmmlab/mirrors-mdformat
+ rev: 0.7.9
+ hooks:
+ - id: mdformat
+ args: ["--number"]
+ additional_dependencies:
+ - mdformat-openmmlab
+ - mdformat_frontmatter
+ - linkify-it-py
+ - repo: https://gitee.com/openmmlab/mirrors-docformatter
+ rev: v1.3.1
+ hooks:
+ - id: docformatter
+ args: ["--in-place", "--wrap-descriptions", "79"]
+ - repo: https://github.com/asottile/pyupgrade
+ rev: v2.32.1
+ hooks:
+ - id: pyupgrade
+ args: ["--py36-plus"]
+ - repo: https://gitee.com/openmmlab/pre-commit-hooks
+ rev: v0.2.0 # Use the ref you want to point at
+ hooks:
+ - id: check-copyright
+ args: ["mmcv", "tests", "--excludes", "mmcv/ops"]
+ - repo: https://gitee.com/openmmlab/mirrors-mypy
+ rev: v0.812
+ hooks:
+ - id: mypy
+ exclude: |-
+ (?x)(
+ ^test
+ | ^docs
+ )
+ # - repo: local
+ # hooks:
+ # - id: clang-format
+ # name: clang-format
+ # description: Format files with ClangFormat
+ # entry: clang-format -style=google -i
+ # language: system
+ # files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
diff --git a/mmcv/.pre-commit-config.yaml b/mmcv/.pre-commit-config.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f4dd84c0b4689bf6ecec35ce39c80abef077426f
--- /dev/null
+++ b/mmcv/.pre-commit-config.yaml
@@ -0,0 +1,72 @@
+exclude: ^tests/data/
+repos:
+ - repo: https://github.com/PyCQA/flake8
+ rev: 3.8.3
+ hooks:
+ - id: flake8
+ - repo: https://github.com/PyCQA/isort
+ rev: 5.10.1
+ hooks:
+ - id: isort
+ - repo: https://github.com/pre-commit/mirrors-yapf
+ rev: v0.30.0
+ hooks:
+ - id: yapf
+ - repo: https://github.com/pre-commit/pre-commit-hooks
+ rev: v3.1.0
+ hooks:
+ - id: trailing-whitespace
+ - id: check-yaml
+ - id: end-of-file-fixer
+ - id: requirements-txt-fixer
+ - id: double-quote-string-fixer
+ - id: check-merge-conflict
+ - id: fix-encoding-pragma
+ args: ["--remove"]
+ - id: mixed-line-ending
+ args: ["--fix=lf"]
+ - repo: https://github.com/codespell-project/codespell
+ rev: v2.1.0
+ hooks:
+ - id: codespell
+ - repo: https://github.com/executablebooks/mdformat
+ rev: 0.7.9
+ hooks:
+ - id: mdformat
+ args: ["--number"]
+ additional_dependencies:
+ - mdformat-openmmlab
+ - mdformat_frontmatter
+ - linkify-it-py
+ - repo: https://github.com/myint/docformatter
+ rev: v1.3.1
+ hooks:
+ - id: docformatter
+ args: ["--in-place", "--wrap-descriptions", "79"]
+ - repo: https://github.com/asottile/pyupgrade
+ rev: v2.32.1
+ hooks:
+ - id: pyupgrade
+ args: ["--py36-plus"]
+ - repo: https://github.com/open-mmlab/pre-commit-hooks
+ rev: v0.2.0 # Use the ref you want to point at
+ hooks:
+ - id: check-copyright
+ args: ["mmcv", "tests", "--excludes", "mmcv/ops"]
+ - repo: https://github.com/pre-commit/mirrors-mypy
+ rev: v0.812
+ hooks:
+ - id: mypy
+ exclude: |-
+ (?x)(
+ ^test
+ | ^docs
+ )
+ # - repo: local
+ # hooks:
+ # - id: clang-format
+ # name: clang-format
+ # description: Format files with ClangFormat
+ # entry: clang-format -style=google -i
+ # language: system
+ # files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$
diff --git a/mmcv/.readthedocs.yml b/mmcv/.readthedocs.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7d5f1c2060a64e5cf9c2bec433cd24532a283164
--- /dev/null
+++ b/mmcv/.readthedocs.yml
@@ -0,0 +1,9 @@
+version: 2
+
+formats: all
+
+python:
+ version: 3.7
+ install:
+ - requirements: requirements/runtime.txt
+ - requirements: requirements/docs.txt
diff --git a/mmcv/CITATION.cff b/mmcv/CITATION.cff
new file mode 100644
index 0000000000000000000000000000000000000000..786117aac3e063efc18ad1b55e163d570a09e379
--- /dev/null
+++ b/mmcv/CITATION.cff
@@ -0,0 +1,8 @@
+cff-version: 1.2.0
+message: "If you use this software, please cite it as below."
+authors:
+ - name: "MMCV Contributors"
+title: "OpenMMLab Computer Vision Foundation"
+date-released: 2018-08-22
+url: "https://github.com/open-mmlab/mmcv"
+license: Apache-2.0
diff --git a/mmcv/CONTRIBUTING.md b/mmcv/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..eea0b2544fd606d8593f1b2f12008a76673829d1
--- /dev/null
+++ b/mmcv/CONTRIBUTING.md
@@ -0,0 +1,59 @@
+## Contributing to OpenMMLab
+
+All kinds of contributions are welcome, including but not limited to the following.
+
+- Fix typo or bugs
+- Add documentation or translate the documentation into other languages
+- Add new features and components
+
+### Workflow
+
+1. fork and pull the latest OpenMMLab repository
+2. checkout a new branch (do not use master branch for PRs)
+3. commit your changes
+4. create a PR
+
+```{note}
+If you plan to add some new features that involve large changes, it is encouraged to open an issue for discussion first.
+```
+
+### Code style
+
+#### Python
+
+We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code style.
+
+We use the following tools for linting and formatting:
+
+- [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools.
+- [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports.
+- [yapf](https://github.com/google/yapf): A formatter for Python files.
+- [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files.
+- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files.
+- [docformatter](https://github.com/myint/docformatter): A formatter to format docstring.
+
+Style configurations of yapf and isort can be found in [setup.cfg](./setup.cfg).
+
+We use [pre-commit hooks](https://pre-commit.com/) that run `flake8`, `yapf` and `isort`, check `trailing whitespaces` and `markdown files`,
+fix `end-of-files`, `double-quoted-strings`, `python-encoding-pragma` and `mixed-line-ending`, and sort `requirements.txt` automatically on every commit.
+The config for a pre-commit hook is stored in [.pre-commit-config](./.pre-commit-config.yaml).
+
+After you clone the repository, you will need to install and initialize the pre-commit hook.
+
+```shell
+pip install -U pre-commit
+```
+
+From the repository folder
+
+```shell
+pre-commit install
+```
+
+After this, the code linters and formatter will be enforced on every commit.
+
+> Before you create a PR, make sure that your code lints and is formatted by yapf.
+
+#### C++ and CUDA
+
+We follow the [Google C++ Style Guide](https://google.github.io/styleguide/cppguide.html).
diff --git a/mmcv/Jenkinsfile b/mmcv/Jenkinsfile
new file mode 100644
index 0000000000000000000000000000000000000000..f0c19d9f3c3e0efc9ed218efa2259c598e383a06
--- /dev/null
+++ b/mmcv/Jenkinsfile
@@ -0,0 +1,56 @@
+def docker_images = ["registry.cn-hangzhou.aliyuncs.com/sensetime/openmmlab:cuda10.1-cudnn7-devel-ubuntu18.04-py37-pt1.3",
+ "registry.cn-hangzhou.aliyuncs.com/sensetime/openmmlab:cuda10.2-cudnn7-devel-ubuntu18.04-py37-pt1.5"]
+def torch_versions = ["1.3.0", "1.5.0"]
+def torchvision_versions = ["0.4.2", "0.6.0"]
+
+
+def get_stages(docker_image, folder) {
+ def pip_mirror = "-i https://mirrors.aliyun.com/pypi/simple"
+ stages = {
+ docker.image(docker_image).inside('-u root --gpus all --net host') {
+ sh "rm -rf ${env.WORKSPACE}-${folder} ${env.WORKSPACE}-${folder}@tmp"
+ sh "cp -r ${env.WORKSPACE} ${env.WORKSPACE}-${folder}"
+ try {
+ dir("${env.WORKSPACE}-${folder}") {
+ stage("before_install") {
+ sh "apt-get update && apt-get install -y ninja-build"
+ }
+ stage("dependencies") {
+ // torch and torchvision are pre-installed in dockers
+ sh "pip list | grep torch"
+ sh "apt-get install -y ffmpeg libturbojpeg"
+ sh "pip install pytest coverage lmdb PyTurboJPEG Cython ${pip_mirror}"
+ }
+ stage("build") {
+ sh "MMCV_WITH_OPS=1 pip install -e . ${pip_mirror}"
+ }
+ stage("test") {
+ sh "coverage run --branch --source=mmcv -m pytest tests/"
+ sh "coverage xml"
+ sh "coverage report -m"
+ }
+ }
+ } finally {
+ sh "rm -rf ${env.WORKSPACE}-${folder} ${env.WORKSPACE}-${folder}@tmp"
+ }
+ }
+ }
+ return stages
+}
+
+
+node('master') {
+ // fetch latest change from SCM (Source Control Management)
+ checkout scm
+
+ def stages = [:]
+ for (int i = 0; i < docker_images.size(); i++) {
+ def docker_image = docker_images[i]
+ def torch = torch_versions[i]
+ def torchvision = torchvision_versions[i]
+ def tag = docker_image + '_' + torch + '_' + torchvision
+ def folder = "${i}"
+ stages[tag] = get_stages(docker_image, folder)
+ }
+ parallel stages
+}
diff --git a/mmcv/LICENSE b/mmcv/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..f02314255d824c0816b0bf1648aac8ab78976199
--- /dev/null
+++ b/mmcv/LICENSE
@@ -0,0 +1,203 @@
+Copyright (c) OpenMMLab. All rights reserved
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "[]"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright 2018-2020 Open-MMLab. All rights reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
diff --git a/mmcv/LICENSES.md b/mmcv/LICENSES.md
new file mode 100644
index 0000000000000000000000000000000000000000..5de8358331f4d21529e016807b86b66dc6ca29da
--- /dev/null
+++ b/mmcv/LICENSES.md
@@ -0,0 +1,8 @@
+# Licenses for special operations
+
+In this file, we list the operations that are covered by licenses other than Apache 2.0. Users should be careful about adopting these operations for any commercial use.
+
+| Operation | Files | License |
+| :--------------: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------: | :------------: |
+| upfirdn2d | [mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/pytorch/cuda/upfirdn2d_kernel.cu) | NVIDIA License |
+| fused_leaky_relu | [mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu](https://github.com/open-mmlab/mmcv/blob/master/mmcv/ops/csrc/pytorch/cuda/fused_bias_leakyrelu_cuda.cu) | NVIDIA License |
diff --git a/mmcv/MANIFEST.in b/mmcv/MANIFEST.in
new file mode 100644
index 0000000000000000000000000000000000000000..5de8494b5df3656a4f6a09da26d9f4bb27ed69a5
--- /dev/null
+++ b/mmcv/MANIFEST.in
@@ -0,0 +1,7 @@
+include requirements/runtime.txt
+include mmcv/model_zoo/open_mmlab.json mmcv/model_zoo/deprecated.json mmcv/model_zoo/mmcls.json mmcv/model_zoo/torchvision_0.12.json
+include mmcv/ops/csrc/common/cuda/*.cuh mmcv/ops/csrc/common/cuda/*.hpp mmcv/ops/csrc/common/*.hpp
+include mmcv/ops/csrc/pytorch/*.cpp mmcv/ops/csrc/pytorch/cuda/*.cu mmcv/ops/csrc/pytorch/cuda/*.cpp mmcv/ops/csrc/pytorch/cpu/*.cpp
+include mmcv/ops/csrc/parrots/*.h mmcv/ops/csrc/parrots/*.cpp
+include mmcv/ops/csrc/pytorch/mps/*.mm mmcv/ops/csrc/common/mps/*.h mmcv/ops/csrc/common/mps/*.mm
+recursive-include mmcv/ops/csrc/ *.h *.hpp *.cpp *.cuh *.cu *.mm
diff --git a/mmcv/README.md b/mmcv/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..1a6541a689a48944394db84b48d5b484e63a8708
--- /dev/null
+++ b/mmcv/README.md
@@ -0,0 +1,274 @@
+
+
+
+
+
+
+
+[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmcv.readthedocs.io/en/latest/)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/)
+[![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv)
+[![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions)
+[![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv)
+[![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE)
+
+English | [简体中文](README_zh-CN.md)
+
+## Introduction
+
+MMCV is a foundational library for computer vision research and supports many
+research projects, as listed below:
+
+- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages.
+- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark.
+- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark.
+- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection.
+- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark.
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark.
+- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox.
+- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark.
+- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark.
+- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark.
+- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark.
+- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark.
+- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark.
+- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark.
+- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark.
+- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox.
+- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox.
+- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab model deployment framework.
+
+It provides the following functionalities.
+
+- Universal IO APIs
+- Image/Video processing
+- Image and annotation visualization
+- Useful utilities (progress bar, timer, ...)
+- PyTorch runner with hooking mechanism
+- Various CNN architectures
+- High-quality implementation of common CUDA ops
+
+It supports the following systems.
+
+- Linux
+- Windows
+- macOS
+
+See the [documentation](http://mmcv.readthedocs.io/en/latest) for more features and usage.
+
+Note: MMCV requires Python 3.6+.
+
+## Installation
+
+There are two versions of MMCV:
+
+- **mmcv-full**: comprehensive, with full features and various CUDA ops out of the box. It takes a longer time to build.
+- **mmcv**: lite, without CUDA ops but with all other features, similar to mmcv\<1.0.0. It is useful when you do not need those CUDA ops.
+
+**Note**: Do not install both versions in the same environment; otherwise you may encounter errors like `ModuleNotFoundError`. You need to uninstall one before installing the other. *Installing the full version is highly recommended if CUDA is available.*
+
+a. Install the full version.
+
+Before installing mmcv-full, make sure that PyTorch has been successfully installed following the [official guide](https://pytorch.org/).
+
+We provide pre-built mmcv packages (recommended) with different PyTorch and CUDA versions to simplify the building for **Linux and Windows systems**. In addition, you can run [check_installation.py](.dev_scripts/check_installation.py) to check the installation of mmcv-full after running the installation commands.
+
+i. Install the latest version.
+
+The rule for installing the latest `mmcv-full` is as follows:
+
+```shell
+pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html
+```
+
+Please replace `{cu_version}` and `{torch_version}` in the URL with your desired versions. For example,
+to install the latest `mmcv-full` with `CUDA 11.1` and `PyTorch 1.9.0`, use the following command:
+
+```shell
+pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
+```
+
+**Note**: mmcv-full is only compiled on PyTorch 1.x.0 because the compatibility usually holds between 1.x.0 and 1.x.1. If your PyTorch version is 1.x.1, you can install mmcv-full compiled with PyTorch 1.x.0 and it usually works well. For example, if your PyTorch version is 1.8.1 and CUDA version is 11.1, you can use the following command to install mmcv-full.
+
+```shell
+pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
+```
+
+For more details, please refer to the following tables and delete `=={mmcv_version}`.
+
+ii. Install a specified version.
+
+The rule for installing a specific version of `mmcv-full` is as follows:
+
+```shell
+pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html
+```
+
+First of all, please refer to the Releases and replace `{mmcv_version}` with a specific version, e.g. `1.3.9`.
+Then replace `{cu_version}` and `{torch_version}` in the URL with your desired versions. For example,
+to install `mmcv-full==1.3.9` with `CUDA 11.1` and `PyTorch 1.9.0`, use the following command:
+
+```shell
+pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
+```
+
+For more details, please refer to the following tables.
+
+Each non-empty cell below gives the `{cu_version}/{torch_version}` part of the download URL for a supported pre-built combination, i.e. install it with `pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html` (for example, `cu115/torch1.11.0` for CUDA 11.5 and torch 1.11).
+
+| CUDA | torch 1.11 | torch 1.10 | torch 1.9 | torch 1.8 | torch 1.7 | torch 1.6 | torch 1.5 |
+| :--: | :--------: | :--------: | :-------: | :-------: | :-------: | :-------: | :-------: |
+| 11.5 | cu115/torch1.11.0 | | | | | | |
+| 11.3 | cu113/torch1.11.0 | cu113/torch1.10.0 | | | | | |
+| 11.1 | | cu111/torch1.10.0 | cu111/torch1.9.0 | cu111/torch1.8.0 | | | |
+| 11.0 | | | | | cu110/torch1.7.0 | | |
+| 10.2 | cu102/torch1.11.0 | cu102/torch1.10.0 | cu102/torch1.9.0 | cu102/torch1.8.0 | cu102/torch1.7.0 | cu102/torch1.6.0 | cu102/torch1.5.0 |
+| 10.1 | | | | cu101/torch1.8.0 | cu101/torch1.7.0 | cu101/torch1.6.0 | cu101/torch1.5.0 |
+| 9.2 | | | | | cu92/torch1.7.0 | cu92/torch1.6.0 | cu92/torch1.5.0 |
+| cpu | cpu/torch1.11.0 | cpu/torch1.10.0 | cpu/torch1.9.0 | cpu/torch1.8.0 | cpu/torch1.7.0 | cpu/torch1.6.0 | cpu/torch1.5.0 |
+
+**Note**: The pre-built packages provided above do not include all versions of mmcv-full; you can click on the corresponding links to see the supported versions. For example, if you click [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html), you can see that `cu102-torch1.8.0` only provides mmcv-full 1.3.0 and above. In addition, since v1.3.17 we no longer provide `mmcv-full` pre-built packages compiled with `PyTorch 1.3 & 1.4`. You can find previous versions compiled with PyTorch 1.3 & 1.4 [here](./docs/en/get_started/previous_versions.md). Their compatibility is still ensured in our CI, but support for PyTorch 1.3 & 1.4 will be dropped next year.
+
+**Note**: mmcv-full does not provide pre-built packages for `cu102-torch1.11` and `cu92-torch*` on Windows.
+
+Another way is to compile locally by running
+
+```shell
+pip install mmcv-full
+```
+
+Note that local compilation may take up to 10 minutes.
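+
+After installation, a quick sanity check (a minimal sketch, not an official script) is to import a CUDA op, which is only shipped with `mmcv-full`:
+
+```python
+# Minimal sanity check (a sketch): mmcv.ops is only available in mmcv-full,
+# so this import fails if only the lite `mmcv` package is installed.
+import mmcv
+from mmcv.ops import nms  # noqa: F401
+
+print(mmcv.__version__)
+```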
+
+b. Install the lite version.
+
+```shell
+pip install mmcv
+```
+
+c. Install the full version with custom operators for onnxruntime
+
+- Check [here](docs/en/deployment/onnxruntime_op.md) for detailed instruction.
+
+If you would like to build MMCV from source, please refer to the [guide](https://mmcv.readthedocs.io/en/latest/get_started/build.html).
+
+## FAQ
+
+If you face installation issues, CUDA-related issues, or RuntimeErrors,
+you may first refer to these [Frequently Asked Questions](https://mmcv.readthedocs.io/en/latest/faq.html).
+
+## Citation
+
+If you find this project useful in your research, please consider citing:
+
+```latex
+@misc{mmcv,
+ title={{MMCV: OpenMMLab} Computer Vision Foundation},
+ author={MMCV Contributors},
+ howpublished = {\url{https://github.com/open-mmlab/mmcv}},
+ year={2018}
+}
+```
+
+## Contributing
+
+We appreciate all contributions to improve MMCV. Please refer to [CONTRIBUTING.md](CONTRIBUTING.md) for the contributing guideline.
+
+## License
+
+MMCV is released under the Apache 2.0 license, while some specific operations in this library are covered by other licenses. Please refer to [LICENSES.md](LICENSES.md) for a careful check if you are using our code for commercial purposes.
diff --git a/mmcv/README_zh-CN.md b/mmcv/README_zh-CN.md
new file mode 100644
index 0000000000000000000000000000000000000000..8c768c837ecddc7f6c4d7e036f590d9d2b96fa64
--- /dev/null
+++ b/mmcv/README_zh-CN.md
@@ -0,0 +1,276 @@
+
+
+
+
+
+
+
+[![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmcv.readthedocs.io/zh_CN/latest/)
+[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmcv)](https://pypi.org/project/mmcv/)
+[![PyPI](https://img.shields.io/pypi/v/mmcv)](https://pypi.org/project/mmcv)
+[![badge](https://github.com/open-mmlab/mmcv/workflows/build/badge.svg)](https://github.com/open-mmlab/mmcv/actions)
+[![codecov](https://codecov.io/gh/open-mmlab/mmcv/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmcv)
+[![license](https://img.shields.io/github/license/open-mmlab/mmcv.svg)](https://github.com/open-mmlab/mmcv/blob/master/LICENSE)
+
+[English](README.md) | 简体中文
+
+## 简介
+
+MMCV 是一个面向计算机视觉的基础库,它支持了很多开源项目,例如:
+
+- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口
+- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱
+- [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱
+- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台
+- [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱
+- [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具箱
+- [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱
+- [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准
+- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准
+- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准
+- [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准
+- [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱
+- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台
+- [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准
+- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱
+- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱
+- [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架
+
+MMCV 提供了如下众多功能:
+
+- 通用的 IO 接口
+- 图像和视频处理
+- 图像和标注结果可视化
+- 常用小工具(进度条,计时器等)
+- 基于 PyTorch 的通用训练框架
+- 多种 CNN 网络结构
+- 高质量实现的常见 CUDA 算子
+
+MMCV 支持以下的系统:
+
+- Linux
+- Windows
+- macOS
+
+如想了解更多特性和使用,请参考[文档](http://mmcv.readthedocs.io/zh_CN/latest)。
+
+提示: MMCV 需要 Python 3.6 以上版本。
+
+## 安装
+
+MMCV 有两个版本:
+
+- **mmcv-full**: 完整版,包含所有的特性以及丰富的开箱即用的 CUDA 算子。注意完整版本可能需要更长时间来编译。
+- **mmcv**: 精简版,不包含 CUDA 算子但包含其余所有特性和功能,类似 MMCV 1.0 之前的版本。如果你不需要使用 CUDA 算子的话,精简版可以作为一个考虑选项。
+
+**注意**: 请不要在同一个环境中安装两个版本,否则可能会遇到类似 `ModuleNotFound` 的错误。在安装一个版本之前,需要先卸载另一个。`如果CUDA可用,强烈推荐安装mmcv-full`。
+
+a. 安装完整版
+
+在安装 mmcv-full 之前,请确保 PyTorch 已经成功安装在环境中,可以参考 PyTorch [官方文档](https://pytorch.org/)。
+
+我们提供了 **Linux 和 Windows 平台** PyTorch 和 CUDA 版本组合的 mmcv-full 预编译包,可以大大简化用户安装编译过程。强烈推荐通过预编译包来安装。另外,安装完成后可以运行 [check_installation.py](.dev_scripts/check_installation.py) 脚本检查 mmcv-full 是否安装成功。
+
+i. 安装最新版本
+
+如下是安装最新版 `mmcv-full` 的命令
+
+```shell
+pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html
+```
+
+请将链接中的 `{cu_version}` 和 `{torch_version}` 根据自身需求替换成实际的版本号,例如想安装和 `CUDA 11.1`、`PyTorch 1.9.0` 兼容的最新版 `mmcv-full`,使用如下替换过的命令
+
+```shell
+pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
+```
+
+**注意**: PyTorch 在 1.x.0 和 1.x.1 之间通常是兼容的,故 mmcv-full 只提供 1.x.0 的编译包。如果你的 PyTorch 版本是 1.x.1,你可以放心地安装在 1.x.0 版本编译的 mmcv-full。例如,如果你的 PyTorch 版本是 1.8.1、CUDA 版本是 11.1,你可以使用以下命令安装 mmcv-full。
+
+```shell
+pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.8.0/index.html
+```
+
+如果想知道更多 CUDA 和 PyTorch 版本的命令,可以参考下面的表格,将链接中的 `=={mmcv_version}` 删去即可。
+
+ii. 安装特定的版本
+
+如下是安装特定版本 `mmcv-full` 的命令
+
+```shell
+pip install mmcv-full=={mmcv_version} -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html
+```
+
+首先请参考版本发布信息找到想要安装的版本号,将 `{mmcv_version}` 替换成该版本号,例如 `1.3.9`。
+然后将链接中的 `{cu_version}` 和 `{torch_version}` 根据自身需求替换成实际的版本号,例如想安装和 `CUDA 11.1`、`PyTorch 1.9.0` 兼容的 `mmcv-full` 1.3.9 版本,使用如下替换过的命令
+
+```shell
+pip install mmcv-full==1.3.9 -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.9.0/index.html
+```
+
+对于更多的 PyTorch 和 CUDA 版本组合,请参考下表:
+
+表中非空单元格为对应组合的 `{cu_version}/{torch_version}`,将其代入上述安装命令即可(例如 `cu115/torch1.11.0` 对应 CUDA 11.5 和 torch 1.11)。
+
+| CUDA | torch 1.11 | torch 1.10 | torch 1.9 | torch 1.8 | torch 1.7 | torch 1.6 | torch 1.5 |
+| :--: | :--------: | :--------: | :-------: | :-------: | :-------: | :-------: | :-------: |
+| 11.5 | cu115/torch1.11.0 | | | | | | |
+| 11.3 | cu113/torch1.11.0 | cu113/torch1.10.0 | | | | | |
+| 11.1 | | cu111/torch1.10.0 | cu111/torch1.9.0 | cu111/torch1.8.0 | | | |
+| 11.0 | | | | | cu110/torch1.7.0 | | |
+| 10.2 | cu102/torch1.11.0 | cu102/torch1.10.0 | cu102/torch1.9.0 | cu102/torch1.8.0 | cu102/torch1.7.0 | cu102/torch1.6.0 | cu102/torch1.5.0 |
+| 10.1 | | | | cu101/torch1.8.0 | cu101/torch1.7.0 | cu101/torch1.6.0 | cu101/torch1.5.0 |
+| 9.2 | | | | | cu92/torch1.7.0 | cu92/torch1.6.0 | cu92/torch1.5.0 |
+| cpu | cpu/torch1.11.0 | cpu/torch1.10.0 | cpu/torch1.9.0 | cpu/torch1.8.0 | cpu/torch1.7.0 | cpu/torch1.6.0 | cpu/torch1.5.0 |
+
+**注意**:以上提供的预编译包并不囊括所有的 mmcv-full 版本,你可以点击对应链接查看支持的版本。例如,点击 [cu102-torch1.8.0](https://download.openmmlab.com/mmcv/dist/cu102/torch1.8.0/index.html),可以看到 `cu102-torch1.8.0` 只提供了 1.3.0 及以上的 mmcv-full 版本。另外,从 `mmcv v1.3.17` 开始,我们不再提供`PyTorch 1.3 & 1.4` 对应的 mmcv-full 预编译包。你可以在 [这](./docs/zh_cn/get_started/previous_versions.md) 找到 `PyTorch 1.3 & 1.4` 对应的预编包。虽然我们不再提供 `PyTorch 1.3 & 1.4` 对应的预编译包,但是我们依然在 CI 中保证对它们的兼容持续到下一年。
+
+**注意**:mmcv-full 没有提供 Windows 平台 `cu102-torch1.8.0` 和 `cu92-torch*` 的预编译包。
+
+除了使用预编译包之外,另一种方式是在本地进行编译,直接运行下述命令
+
+```shell
+pip install mmcv-full
+```
+
+但注意本地编译可能会耗时 10 分钟以上。
+
+b. 安装精简版
+
+```shell
+pip install mmcv
+```
+
+c. 安装完整版并且编译 onnxruntime 的自定义算子
+
+- 详细的指南请查看[这里](docs/zh_cn/deployment/onnxruntime_op.md)。
+
+如果想从源码编译 MMCV,请参考[该文档](https://mmcv.readthedocs.io/zh_CN/latest/get_started/build.html)。
+
+## FAQ
+
+如果你遇到了安装问题,CUDA 相关的问题或者 RuntimeErrors,可以首先参考[问题解决页面](https://mmcv.readthedocs.io/zh_CN/latest/faq.html) 看是否已经有解决方案。
+
+## 贡献指南
+
+我们感谢所有的贡献者为改进和提升 MMCV 所作出的努力。请参考[贡献指南](CONTRIBUTING.md)来了解参与项目贡献的相关指引。
+
+## 许可证
+
+`MMCV` 目前以 Apache 2.0 的许可证发布,但是其中有一部分功能并不是使用的 Apache2.0 许可证,我们在 [许可证](LICENSES.md) 中详细地列出了这些功能以及他们对应的许可证,如果您正在从事盈利性活动,请谨慎参考此文档。
+
+## 欢迎加入 OpenMMLab 社区
+
+扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),加入 OpenMMLab 团队的 [官方交流 QQ 群](https://jq.qq.com/?_wv=1027&k=3ijNTqfg),或添加微信小助手”OpenMMLabwx“加入官方交流微信群。
+
+
+
+我们会在 OpenMMLab 社区为大家
+
+- 📢 分享 AI 框架的前沿核心技术
+- 💻 解读 PyTorch 常用模块源码
+- 📰 发布 OpenMMLab 的相关新闻
+- 🚀 介绍 OpenMMLab 开发的前沿算法
+- 🏃 获取更高效的问题答疑和意见反馈
+- 🔥 提供与各行各业开发者充分交流的平台
+
+干货满满 📘,等你来撩 💗,OpenMMLab 社区期待您的加入 👬
diff --git a/mmcv/TERMINOLOGY.md b/mmcv/TERMINOLOGY.md
new file mode 100644
index 0000000000000000000000000000000000000000..07411b7774c2ed713f472c1287b98b871c7f4d02
--- /dev/null
+++ b/mmcv/TERMINOLOGY.md
@@ -0,0 +1,30 @@
+# English-Chinese terminology comparison (英汉术语对照)
+
+This document is used as a reference for English-Chinese terminology translation.
+
+该文档用作中英文翻译对照参考。
+
+| English | 中文 |
+| :---------------: | :----------: |
+| annotation | 标注 |
+| backbone | 主干网络 |
+| benchmark | 基准测试 |
+| checkpoint | 模型权重文件 |
+| classifier | 分类器 |
+| cls_head | 分类头 |
+| decoder | 解码器 |
+| detector | 检测器 |
+| encoder | 编码器 |
+| finetune | 微调 |
+| ground truth | 真实标签 |
+| hook | 钩子 |
+| localizer | 定位器 |
+| neck | 模型颈部 |
+| pipeline | 流水线 |
+| recognizer | 识别器 |
+| register | 注册器 |
+| schedule | 调整 |
+| scheduler | 调度器 |
+| segmentor | 分割器 |
+| tensor | 张量 |
+| training schedule | 训练策略 |
diff --git a/mmcv/docker/README.md b/mmcv/docker/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e9985b4ca645a14c9e3f18bf7afcc0cb4f52bf73
--- /dev/null
+++ b/mmcv/docker/README.md
@@ -0,0 +1,70 @@
+# Docker images
+
+There are two `Dockerfile` files to build docker images, one to build an image with the mmcv-full pre-built package and the other with the mmcv development environment.
+
+```text
+.
+|-- README.md
+|-- dev # build with mmcv development environment
+| `-- Dockerfile
+`-- release # build with mmcv pre-built package
+ `-- Dockerfile
+```
+
+## Build docker images
+
+### Build with mmcv pre-built package
+
+Build with local repository
+
+```bash
+git clone https://github.com/open-mmlab/mmcv.git && cd mmcv
+docker build -t mmcv -f docker/release/Dockerfile .
+```
+
+Or build with remote repository
+
+```bash
+docker build -t mmcv https://github.com/open-mmlab/mmcv.git#master:docker/release
+```
+
+The [Dockerfile](release/Dockerfile) installs the latest released version of mmcv-full by default, but you can pass the `MMCV` build argument to install a specific version.
+
+```bash
+docker image build -t mmcv -f docker/release/Dockerfile --build-arg MMCV=1.5.0 .
+```
+
+If you want to use other versions of PyTorch and CUDA, you can also pass them when building docker images.
+
+An example to build an image with PyTorch 1.9 and CUDA 11.1:
+
+```bash
+docker build -t mmcv -f docker/release/Dockerfile \
+ --build-arg PYTORCH=1.9.0 \
+ --build-arg CUDA=11.1 \
+ --build-arg CUDNN=8 \
+ --build-arg MMCV=1.5.0 .
+```
+
+More available versions of PyTorch and CUDA can be found at [dockerhub/pytorch](https://hub.docker.com/r/pytorch/pytorch/tags).
+
+### Build with mmcv development environment
+
+If you want to build a docker image with the mmcv development environment, you can use the following command
+
+```bash
+git clone https://github.com/open-mmlab/mmcv.git && cd mmcv
+docker build -t mmcv -f docker/dev/Dockerfile --build-arg CUDA_ARCH=7.5 .
+```
+
+Note that `CUDA_ARCH` is the compute capability of your GPU, which you can find at [Compute Capability](https://developer.nvidia.com/cuda-gpus#compute).
+
+The building process may take 10 minutes or more.
+
+## Run images
+
+```bash
+docker run --gpus all --shm-size=8g -it mmcv
+```
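+
+Once the container is running, a quick way to verify the image (a minimal sketch, assuming the image was built with mmcv-full) is to start Python inside it and import a CUDA op:
+
+```python
+# Run inside the container. A sketch only: mmcv.ops is shipped with mmcv-full,
+# so this import fails if only the lite `mmcv` package is present.
+import torch
+import mmcv
+from mmcv.ops import nms  # noqa: F401
+
+print(mmcv.__version__, torch.cuda.is_available())
+```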
+
+See [docker run](https://docs.docker.com/engine/reference/commandline/run/) for more usages.
diff --git a/mmcv/docker/dev/Dockerfile b/mmcv/docker/dev/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..0c673e958f2909cd80f589100c2b7cbfa726c499
--- /dev/null
+++ b/mmcv/docker/dev/Dockerfile
@@ -0,0 +1,32 @@
+ARG PYTORCH="1.8.1"
+ARG CUDA="10.2"
+ARG CUDNN="7"
+
+FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
+
+# To fix GPG key error when running apt-get update
+RUN rm /etc/apt/sources.list.d/cuda.list \
+ && rm /etc/apt/sources.list.d/nvidia-ml.list \
+ && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
+ && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+
+# Install git and system dependencies for opencv-python
+RUN apt-get update && apt-get install -y git \
+ && apt-get update && apt-get install -y libgl1 libglib2.0-0
+
+# Install system dependencies for unit tests
+RUN apt-get install -y ffmpeg libturbojpeg \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+# build mmcv-full from source with develop mode
+ARG HTTPS_PROXY=""
+ENV https_proxy=${HTTPS_PROXY}
+ENV FORCE_CUDA="1"
+ENV MMCV_WITH_OPS="1"
+ARG CUDA_ARCH=""
+ENV TORCH_CUDA_ARCH_LIST=${CUDA_ARCH}
+RUN git clone https://github.com/open-mmlab/mmcv.git /mmcv
+WORKDIR /mmcv
+RUN git rev-parse --short HEAD
+RUN pip install --no-cache-dir -e .[all] -v && pip install pre-commit && pre-commit install
diff --git a/mmcv/docker/release/Dockerfile b/mmcv/docker/release/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..493aa6d1625c9bdee1b9f3bd8121c6ff2f723d4a
--- /dev/null
+++ b/mmcv/docker/release/Dockerfile
@@ -0,0 +1,20 @@
+ARG PYTORCH="1.8.1"
+ARG CUDA="10.2"
+ARG CUDNN="7"
+
+FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
+
+# To fix GPG key error when running apt-get update
+RUN rm /etc/apt/sources.list.d/cuda.list \
+ && rm /etc/apt/sources.list.d/nvidia-ml.list \
+ && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub \
+ && apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub
+
+# Install system dependencies for opencv-python
+RUN apt-get update && apt-get install -y libgl1 libglib2.0-0 \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install mmcv-full
+ARG MMCV="1.5.1"
+RUN pip install openmim && mim install mmcv-full==${MMCV} && python -c 'import mmcv;print(mmcv.__version__)'
diff --git a/mmcv/examples/train.py b/mmcv/examples/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..b08d36bf621747354d0df30bd6d787fd2c12faf1
--- /dev/null
+++ b/mmcv/examples/train.py
@@ -0,0 +1,84 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+import torchvision.transforms as transforms
+from torch.utils.data import DataLoader
+from torchvision.datasets import CIFAR10
+
+from mmcv.parallel import MMDataParallel
+from mmcv.runner import EpochBasedRunner
+from mmcv.utils import get_logger
+
+
+class Model(nn.Module):
+
+ def __init__(self):
+ super().__init__()
+ self.conv1 = nn.Conv2d(3, 6, 5)
+ self.pool = nn.MaxPool2d(2, 2)
+ self.conv2 = nn.Conv2d(6, 16, 5)
+ self.fc1 = nn.Linear(16 * 5 * 5, 120)
+ self.fc2 = nn.Linear(120, 84)
+ self.fc3 = nn.Linear(84, 10)
+ self.loss_fn = nn.CrossEntropyLoss()
+
+ def forward(self, x):
+ x = self.pool(F.relu(self.conv1(x)))
+ x = self.pool(F.relu(self.conv2(x)))
+ x = x.view(-1, 16 * 5 * 5)
+ x = F.relu(self.fc1(x))
+ x = F.relu(self.fc2(x))
+ x = self.fc3(x)
+ return x
+
+ def train_step(self, data, optimizer):
+ images, labels = data
+ predicts = self(images) # -> self.__call__() -> self.forward()
+ loss = self.loss_fn(predicts, labels)
+ return {'loss': loss}
+
+
+if __name__ == '__main__':
+ model = Model()
+ if torch.cuda.is_available():
+ # only use gpu:0 to train
+ # Solved issue https://github.com/open-mmlab/mmcv/issues/1470
+ model = MMDataParallel(model.cuda(), device_ids=[0])
+
+ # dataset and dataloader
+ transform = transforms.Compose([
+ transforms.ToTensor(),
+ transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
+ ])
+ trainset = CIFAR10(
+ root='data', train=True, download=True, transform=transform)
+ trainloader = DataLoader(
+ trainset, batch_size=128, shuffle=True, num_workers=2)
+
+ optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
+ logger = get_logger('mmcv')
+ # runner is a scheduler to manage the training
+ runner = EpochBasedRunner(
+ model,
+ optimizer=optimizer,
+ work_dir='./work_dir',
+ logger=logger,
+ max_epochs=4)
+
+ # learning rate scheduler config
+ lr_config = dict(policy='step', step=[2, 3])
+ # configuration of optimizer
+ optimizer_config = dict(grad_clip=None)
+ # configuration of saving checkpoints periodically
+ checkpoint_config = dict(interval=1)
+ # save log periodically and multiple hooks can be used simultaneously
+ log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')])
+ # register hooks to runner and those hooks will be invoked automatically
+ runner.register_training_hooks(
+ lr_config=lr_config,
+ optimizer_config=optimizer_config,
+ checkpoint_config=checkpoint_config,
+ log_config=log_config)
+
+ runner.run([trainloader], [('train', 1)])
diff --git a/mmcv/mmcv/__init__.py b/mmcv/mmcv/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..14c556acdf5832a1da569da6819a428f17adc328
--- /dev/null
+++ b/mmcv/mmcv/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# flake8: noqa
+from .arraymisc import *
+from .fileio import *
+from .image import *
+from .utils import *
+from .version import *
+from .video import *
+from .visualization import *
+
+# The following modules are not imported to this level, so mmcv may be used
+# without PyTorch.
+# - runner
+# - parallel
+# - op
+# - device
diff --git a/mmcv/mmcv/arraymisc/__init__.py b/mmcv/mmcv/arraymisc/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b4700d6139ae3d604ff6e542468cce4200c020c
--- /dev/null
+++ b/mmcv/mmcv/arraymisc/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .quantization import dequantize, quantize
+
+__all__ = ['quantize', 'dequantize']
diff --git a/mmcv/mmcv/arraymisc/quantization.py b/mmcv/mmcv/arraymisc/quantization.py
new file mode 100644
index 0000000000000000000000000000000000000000..6182710d51787061304cfc7304ec97d565822536
--- /dev/null
+++ b/mmcv/mmcv/arraymisc/quantization.py
@@ -0,0 +1,65 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+import numpy as np
+
+
+def quantize(arr: np.ndarray,
+ min_val: Union[int, float],
+ max_val: Union[int, float],
+ levels: int,
+             dtype=np.int64) -> np.ndarray:
+    """Quantize an array from (-inf, inf) to [0, levels-1].
+
+ Args:
+ arr (ndarray): Input array.
+ min_val (int or float): Minimum value to be clipped.
+ max_val (int or float): Maximum value to be clipped.
+ levels (int): Quantization levels.
+ dtype (np.type): The type of the quantized array.
+
+ Returns:
+        np.ndarray: Quantized array.
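+
+    Examples:
+        >>> # illustrative example: 10 levels over [0, 1]
+        >>> quantize(np.array([0., 0.5, 1.]), 0, 1, 10)
+        array([0, 5, 9])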
+ """
+ if not (isinstance(levels, int) and levels > 1):
+ raise ValueError(
+ f'levels must be a positive integer, but got {levels}')
+ if min_val >= max_val:
+ raise ValueError(
+ f'min_val ({min_val}) must be smaller than max_val ({max_val})')
+
+ arr = np.clip(arr, min_val, max_val) - min_val
+ quantized_arr = np.minimum(
+ np.floor(levels * arr / (max_val - min_val)).astype(dtype), levels - 1)
+
+ return quantized_arr
+
+
+def dequantize(arr: np.ndarray,
+ min_val: Union[int, float],
+ max_val: Union[int, float],
+ levels: int,
+               dtype=np.float64) -> np.ndarray:
+ """Dequantize an array.
+
+ Args:
+ arr (ndarray): Input array.
+ min_val (int or float): Minimum value to be clipped.
+ max_val (int or float): Maximum value to be clipped.
+ levels (int): Quantization levels.
+ dtype (np.dtype): The type of the dequantized array.
+
+ Returns:
+ np.ndarray: Dequantized array.
+ """
+ if not (isinstance(levels, int) and levels > 1):
+ raise ValueError(
+ f'levels must be a positive integer, but got {levels}')
+ if min_val >= max_val:
+ raise ValueError(
+ f'min_val ({min_val}) must be smaller than max_val ({max_val})')
+
+ dequantized_arr = (arr + 0.5).astype(dtype) * (max_val -
+ min_val) / levels + min_val
+
+ return dequantized_arr
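As a quick sanity check of the two helpers above, here is a minimal sketch (assuming `mmcv` and `numpy` are importable in the current environment) that quantizes a small array into 10 levels and maps it back; out-of-range values are clipped first, and dequantization returns the centre of each bin rather than the original values.

```python
import numpy as np
from mmcv.arraymisc import quantize, dequantize  # assumes mmcv is installed

arr = np.array([-1.2, 0.0, 0.4, 0.9, 2.5])
q = quantize(arr, min_val=-1.0, max_val=1.0, levels=10)
d = dequantize(q, min_val=-1.0, max_val=1.0, levels=10)

print(q)  # [0 5 7 9 9] -- values are clipped to [min_val, max_val] before binning
print(d)  # [-0.9  0.1  0.5  0.9  0.9] -- bin centres, not the original values
```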
diff --git a/mmcv/mmcv/cnn/__init__.py b/mmcv/mmcv/cnn/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..7246c897430f0cc7ce12719ad8608824fc734446
--- /dev/null
+++ b/mmcv/mmcv/cnn/__init__.py
@@ -0,0 +1,41 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .alexnet import AlexNet
+# yapf: disable
+from .bricks import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS,
+ PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS,
+ ContextBlock, Conv2d, Conv3d, ConvAWS2d, ConvModule,
+ ConvTranspose2d, ConvTranspose3d, ConvWS2d,
+ DepthwiseSeparableConvModule, GeneralizedAttention,
+ HSigmoid, HSwish, Linear, MaxPool2d, MaxPool3d,
+ NonLocal1d, NonLocal2d, NonLocal3d, Scale, Swish,
+ build_activation_layer, build_conv_layer,
+ build_norm_layer, build_padding_layer, build_plugin_layer,
+ build_upsample_layer, conv_ws_2d, is_norm)
+from .builder import MODELS, build_model_from_cfg
+# yapf: enable
+from .resnet import ResNet, make_res_layer
+from .utils import (INITIALIZERS, Caffe2XavierInit, ConstantInit, KaimingInit,
+ NormalInit, PretrainedInit, TruncNormalInit, UniformInit,
+ XavierInit, bias_init_with_prob, caffe2_xavier_init,
+ constant_init, fuse_conv_bn, get_model_complexity_info,
+ initialize, kaiming_init, normal_init, trunc_normal_init,
+ uniform_init, xavier_init)
+from .vgg import VGG, make_vgg_layer
+
+__all__ = [
+ 'AlexNet', 'VGG', 'make_vgg_layer', 'ResNet', 'make_res_layer',
+ 'constant_init', 'xavier_init', 'normal_init', 'trunc_normal_init',
+ 'uniform_init', 'kaiming_init', 'caffe2_xavier_init',
+ 'bias_init_with_prob', 'ConvModule', 'build_activation_layer',
+ 'build_conv_layer', 'build_norm_layer', 'build_padding_layer',
+ 'build_upsample_layer', 'build_plugin_layer', 'is_norm', 'NonLocal1d',
+ 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'HSigmoid', 'Swish', 'HSwish',
+ 'GeneralizedAttention', 'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS',
+ 'PADDING_LAYERS', 'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale',
+ 'get_model_complexity_info', 'conv_ws_2d', 'ConvAWS2d', 'ConvWS2d',
+ 'fuse_conv_bn', 'DepthwiseSeparableConvModule', 'Linear', 'Conv2d',
+ 'ConvTranspose2d', 'MaxPool2d', 'ConvTranspose3d', 'MaxPool3d', 'Conv3d',
+ 'initialize', 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit',
+ 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit',
+ 'Caffe2XavierInit', 'MODELS', 'build_model_from_cfg'
+]
diff --git a/mmcv/mmcv/cnn/alexnet.py b/mmcv/mmcv/cnn/alexnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..4d45d96d86bdcb52a51f095c4571b21c8421cbfa
--- /dev/null
+++ b/mmcv/mmcv/cnn/alexnet.py
@@ -0,0 +1,63 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+from typing import Optional
+
+import torch
+import torch.nn as nn
+
+
+class AlexNet(nn.Module):
+ """AlexNet backbone.
+
+ Args:
+ num_classes (int): Number of classes for classification. If it is
+ non-positive, no classification head is built and the module only
+ outputs convolutional features.
+ """
+
+ def __init__(self, num_classes: int = -1):
+ super().__init__()
+ self.num_classes = num_classes
+ self.features = nn.Sequential(
+ nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
+ nn.ReLU(inplace=True),
+ nn.MaxPool2d(kernel_size=3, stride=2),
+ nn.Conv2d(64, 192, kernel_size=5, padding=2),
+ nn.ReLU(inplace=True),
+ nn.MaxPool2d(kernel_size=3, stride=2),
+ nn.Conv2d(192, 384, kernel_size=3, padding=1),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(384, 256, kernel_size=3, padding=1),
+ nn.ReLU(inplace=True),
+ nn.Conv2d(256, 256, kernel_size=3, padding=1),
+ nn.ReLU(inplace=True),
+ nn.MaxPool2d(kernel_size=3, stride=2),
+ )
+ if self.num_classes > 0:
+ self.classifier = nn.Sequential(
+ nn.Dropout(),
+ nn.Linear(256 * 6 * 6, 4096),
+ nn.ReLU(inplace=True),
+ nn.Dropout(),
+ nn.Linear(4096, 4096),
+ nn.ReLU(inplace=True),
+ nn.Linear(4096, num_classes),
+ )
+
+ def init_weights(self, pretrained: Optional[str] = None) -> None:
+ if isinstance(pretrained, str):
+ logger = logging.getLogger()
+ from ..runner import load_checkpoint
+ load_checkpoint(self, pretrained, strict=False, logger=logger)
+ elif pretrained is None:
+ # use default initializer
+ pass
+ else:
+ raise TypeError('pretrained must be a str or None')
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+
+ x = self.features(x)
+ if self.num_classes > 0:
+ x = x.view(x.size(0), 256 * 6 * 6)
+ x = self.classifier(x)
+
+ return x
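A minimal usage sketch for the backbone above (assuming `mmcv` and `torch` are installed): with `num_classes > 0` the classifier head expects 224x224 inputs, since the feature extractor then produces the 256x6x6 map the linear layers assume.

```python
import torch
from mmcv.cnn import AlexNet  # assumes mmcv is installed

model = AlexNet(num_classes=10)
model.init_weights()                 # no pretrained path -> default initialization
x = torch.randn(1, 3, 224, 224)      # 224x224 yields the expected 256x6x6 feature map
logits = model(x)
print(logits.shape)                  # torch.Size([1, 10])
```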
diff --git a/mmcv/mmcv/cnn/bricks/__init__.py b/mmcv/mmcv/cnn/bricks/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0f33124ed23fc6f27119a37bcb5ab004d3572be0
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/__init__.py
@@ -0,0 +1,35 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .activation import build_activation_layer
+from .context_block import ContextBlock
+from .conv import build_conv_layer
+from .conv2d_adaptive_padding import Conv2dAdaptivePadding
+from .conv_module import ConvModule
+from .conv_ws import ConvAWS2d, ConvWS2d, conv_ws_2d
+from .depthwise_separable_conv_module import DepthwiseSeparableConvModule
+from .drop import Dropout, DropPath
+from .generalized_attention import GeneralizedAttention
+from .hsigmoid import HSigmoid
+from .hswish import HSwish
+from .non_local import NonLocal1d, NonLocal2d, NonLocal3d
+from .norm import build_norm_layer, is_norm
+from .padding import build_padding_layer
+from .plugin import build_plugin_layer
+from .registry import (ACTIVATION_LAYERS, CONV_LAYERS, NORM_LAYERS,
+ PADDING_LAYERS, PLUGIN_LAYERS, UPSAMPLE_LAYERS)
+from .scale import Scale
+from .swish import Swish
+from .upsample import build_upsample_layer
+from .wrappers import (Conv2d, Conv3d, ConvTranspose2d, ConvTranspose3d,
+ Linear, MaxPool2d, MaxPool3d)
+
+__all__ = [
+ 'ConvModule', 'build_activation_layer', 'build_conv_layer',
+ 'build_norm_layer', 'build_padding_layer', 'build_upsample_layer',
+ 'build_plugin_layer', 'is_norm', 'HSigmoid', 'HSwish', 'NonLocal1d',
+ 'NonLocal2d', 'NonLocal3d', 'ContextBlock', 'GeneralizedAttention',
+ 'ACTIVATION_LAYERS', 'CONV_LAYERS', 'NORM_LAYERS', 'PADDING_LAYERS',
+ 'UPSAMPLE_LAYERS', 'PLUGIN_LAYERS', 'Scale', 'ConvAWS2d', 'ConvWS2d',
+ 'conv_ws_2d', 'DepthwiseSeparableConvModule', 'Swish', 'Linear',
+ 'Conv2dAdaptivePadding', 'Conv2d', 'ConvTranspose2d', 'MaxPool2d',
+ 'ConvTranspose3d', 'MaxPool3d', 'Conv3d', 'Dropout', 'DropPath'
+]
diff --git a/mmcv/mmcv/cnn/bricks/activation.py b/mmcv/mmcv/cnn/bricks/activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..23e62722776d18b764cffe4a76e646e3103f8fb7
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/activation.py
@@ -0,0 +1,95 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmcv.utils import TORCH_VERSION, build_from_cfg, digit_version
+from .registry import ACTIVATION_LAYERS
+
+for module in [
+ nn.ReLU, nn.LeakyReLU, nn.PReLU, nn.RReLU, nn.ReLU6, nn.ELU,
+ nn.Sigmoid, nn.Tanh
+]:
+ ACTIVATION_LAYERS.register_module(module=module)
+
+
+@ACTIVATION_LAYERS.register_module(name='Clip')
+@ACTIVATION_LAYERS.register_module()
+class Clamp(nn.Module):
+ """Clamp activation layer.
+
+ This activation function clamps the feature map values to the range
+ :math:`[min, max]`. More details can be found in ``torch.clamp()``.
+
+ Args:
+ min (float, optional): Lower bound of the clamping range.
+ Defaults to -1.
+ max (float, optional): Upper bound of the clamping range.
+ Defaults to 1.
+ """
+
+ def __init__(self, min: float = -1., max: float = 1.):
+ super().__init__()
+ self.min = min
+ self.max = max
+
+ def forward(self, x) -> torch.Tensor:
+ """Forward function.
+
+ Args:
+ x (torch.Tensor): The input tensor.
+
+ Returns:
+ torch.Tensor: Clamped tensor.
+ """
+ return torch.clamp(x, min=self.min, max=self.max)
+
+
+class GELU(nn.Module):
+ r"""Applies the Gaussian Error Linear Units function:
+
+ .. math::
+ \text{GELU}(x) = x * \Phi(x)
+ where :math:`\Phi(x)` is the Cumulative Distribution Function for
+ Gaussian Distribution.
+
+ Shape:
+ - Input: :math:`(N, *)` where `*` means, any number of additional
+ dimensions
+ - Output: :math:`(N, *)`, same shape as the input
+
+ .. image:: scripts/activation_images/GELU.png
+
+ Examples::
+
+ >>> m = nn.GELU()
+ >>> input = torch.randn(2)
+ >>> output = m(input)
+ """
+
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
+ return F.gelu(input)
+
+
+if (TORCH_VERSION == 'parrots'
+ or digit_version(TORCH_VERSION) < digit_version('1.4')):
+ ACTIVATION_LAYERS.register_module(module=GELU)
+else:
+ ACTIVATION_LAYERS.register_module(module=nn.GELU)
+
+
+def build_activation_layer(cfg: Dict) -> nn.Module:
+ """Build activation layer.
+
+ Args:
+ cfg (dict): The activation layer config, which should contain:
+
+ - type (str): Layer type.
+ - layer args: Args needed to instantiate an activation layer.
+
+ Returns:
+ nn.Module: Created activation layer.
+ """
+ return build_from_cfg(cfg, ACTIVATION_LAYERS)
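A short sketch of how the registry above is used in practice (assuming `mmcv` is installed): `build_activation_layer` looks up the `type` key and forwards the remaining keys as constructor arguments, so the registered PyTorch activations and the custom `Clamp` layer are built the same way.

```python
import torch
from mmcv.cnn import build_activation_layer  # assumes mmcv is installed

relu = build_activation_layer(dict(type='ReLU'))
clamp = build_activation_layer(dict(type='Clamp', min=0.0, max=0.5))

x = torch.tensor([-1.0, 0.2, 2.0])
print(relu(x))   # tensor([0.0000, 0.2000, 2.0000])
print(clamp(x))  # tensor([0.0000, 0.2000, 0.5000])
```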
diff --git a/mmcv/mmcv/cnn/bricks/context_block.py b/mmcv/mmcv/cnn/bricks/context_block.py
new file mode 100644
index 0000000000000000000000000000000000000000..15669cab35dcdc98a95df006788f78f84b88dc44
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/context_block.py
@@ -0,0 +1,127 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+import torch
+from torch import nn
+
+from ..utils import constant_init, kaiming_init
+from .registry import PLUGIN_LAYERS
+
+
+def last_zero_init(m: Union[nn.Module, nn.Sequential]) -> None:
+ if isinstance(m, nn.Sequential):
+ constant_init(m[-1], val=0)
+ else:
+ constant_init(m, val=0)
+
+
+@PLUGIN_LAYERS.register_module()
+class ContextBlock(nn.Module):
+ """ContextBlock module in GCNet.
+
+ See 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond'
+ (https://arxiv.org/abs/1904.11492) for details.
+
+ Args:
+ in_channels (int): Channels of the input feature map.
+ ratio (float): Ratio of channels of the transform bottleneck.
+ pooling_type (str): Pooling method for context modeling.
+ Options are 'att' and 'avg', which stand for attention pooling and
+ average pooling respectively. Default: 'att'.
+ fusion_types (Sequence[str]): Fusion methods for feature fusion.
+ Options are 'channel_add' and 'channel_mul', which stand for
+ channel-wise addition and multiplication respectively.
+ Default: ('channel_add',).
+ """
+
+ _abbr_ = 'context_block'
+
+ def __init__(self,
+ in_channels: int,
+ ratio: float,
+ pooling_type: str = 'att',
+ fusion_types: tuple = ('channel_add', )):
+ super().__init__()
+ assert pooling_type in ['avg', 'att']
+ assert isinstance(fusion_types, (list, tuple))
+ valid_fusion_types = ['channel_add', 'channel_mul']
+ assert all([f in valid_fusion_types for f in fusion_types])
+ assert len(fusion_types) > 0, 'at least one fusion should be used'
+ self.in_channels = in_channels
+ self.ratio = ratio
+ self.planes = int(in_channels * ratio)
+ self.pooling_type = pooling_type
+ self.fusion_types = fusion_types
+ if pooling_type == 'att':
+ self.conv_mask = nn.Conv2d(in_channels, 1, kernel_size=1)
+ self.softmax = nn.Softmax(dim=2)
+ else:
+ self.avg_pool = nn.AdaptiveAvgPool2d(1)
+ if 'channel_add' in fusion_types:
+ self.channel_add_conv = nn.Sequential(
+ nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
+ nn.LayerNorm([self.planes, 1, 1]),
+ nn.ReLU(inplace=True), # yapf: disable
+ nn.Conv2d(self.planes, self.in_channels, kernel_size=1))
+ else:
+ self.channel_add_conv = None
+ if 'channel_mul' in fusion_types:
+ self.channel_mul_conv = nn.Sequential(
+ nn.Conv2d(self.in_channels, self.planes, kernel_size=1),
+ nn.LayerNorm([self.planes, 1, 1]),
+ nn.ReLU(inplace=True), # yapf: disable
+ nn.Conv2d(self.planes, self.in_channels, kernel_size=1))
+ else:
+ self.channel_mul_conv = None
+ self.reset_parameters()
+
+ def reset_parameters(self):
+ if self.pooling_type == 'att':
+ kaiming_init(self.conv_mask, mode='fan_in')
+ self.conv_mask.inited = True
+
+ if self.channel_add_conv is not None:
+ last_zero_init(self.channel_add_conv)
+ if self.channel_mul_conv is not None:
+ last_zero_init(self.channel_mul_conv)
+
+ def spatial_pool(self, x: torch.Tensor) -> torch.Tensor:
+ batch, channel, height, width = x.size()
+ if self.pooling_type == 'att':
+ input_x = x
+ # [N, C, H * W]
+ input_x = input_x.view(batch, channel, height * width)
+ # [N, 1, C, H * W]
+ input_x = input_x.unsqueeze(1)
+ # [N, 1, H, W]
+ context_mask = self.conv_mask(x)
+ # [N, 1, H * W]
+ context_mask = context_mask.view(batch, 1, height * width)
+ # [N, 1, H * W]
+ context_mask = self.softmax(context_mask)
+ # [N, 1, H * W, 1]
+ context_mask = context_mask.unsqueeze(-1)
+ # [N, 1, C, 1]
+ context = torch.matmul(input_x, context_mask)
+ # [N, C, 1, 1]
+ context = context.view(batch, channel, 1, 1)
+ else:
+ # [N, C, 1, 1]
+ context = self.avg_pool(x)
+
+ return context
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ # [N, C, 1, 1]
+ context = self.spatial_pool(x)
+
+ out = x
+ if self.channel_mul_conv is not None:
+ # [N, C, 1, 1]
+ channel_mul_term = torch.sigmoid(self.channel_mul_conv(context))
+ out = out * channel_mul_term
+ if self.channel_add_conv is not None:
+ # [N, C, 1, 1]
+ channel_add_term = self.channel_add_conv(context)
+ out = out + channel_add_term
+
+ return out
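A usage sketch for the block above (assuming `mmcv` is installed). Because `last_zero_init` zero-initialises the final conv of each fusion branch, the block behaves as an identity mapping right after construction, which makes it safe to drop into a pretrained backbone.

```python
import torch
from mmcv.cnn import ContextBlock  # assumes mmcv is installed

gc_block = ContextBlock(in_channels=64, ratio=1. / 4)  # bottleneck of 16 channels
x = torch.randn(2, 64, 32, 32)
out = gc_block(x)

print(out.shape)               # torch.Size([2, 64, 32, 32]) -- shape preserved
print(torch.allclose(out, x))  # True at initialisation: the fusion branch starts at zero
```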
diff --git a/mmcv/mmcv/cnn/bricks/conv.py b/mmcv/mmcv/cnn/bricks/conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..147517ef4ecdee16d26b535fa49c26a2fcbdd48e
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/conv.py
@@ -0,0 +1,46 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional
+
+from torch import nn
+
+from .registry import CONV_LAYERS
+
+CONV_LAYERS.register_module('Conv1d', module=nn.Conv1d)
+CONV_LAYERS.register_module('Conv2d', module=nn.Conv2d)
+CONV_LAYERS.register_module('Conv3d', module=nn.Conv3d)
+CONV_LAYERS.register_module('Conv', module=nn.Conv2d)
+
+
+def build_conv_layer(cfg: Optional[Dict], *args, **kwargs) -> nn.Module:
+ """Build convolution layer.
+
+ Args:
+ cfg (None or dict): The conv layer config, which should contain:
+ - type (str): Layer type.
+ - layer args: Args needed to instantiate a conv layer.
+ args (argument list): Arguments passed to the `__init__`
+ method of the corresponding conv layer.
+ kwargs (keyword arguments): Keyword arguments passed to the `__init__`
+ method of the corresponding conv layer.
+
+ Returns:
+ nn.Module: Created conv layer.
+ """
+ if cfg is None:
+ cfg_ = dict(type='Conv2d')
+ else:
+ if not isinstance(cfg, dict):
+ raise TypeError('cfg must be a dict')
+ if 'type' not in cfg:
+ raise KeyError('the cfg dict must contain the key "type"')
+ cfg_ = cfg.copy()
+
+ layer_type = cfg_.pop('type')
+ if layer_type not in CONV_LAYERS:
+ raise KeyError(f'Unrecognized layer type {layer_type}')
+ else:
+ conv_layer = CONV_LAYERS.get(layer_type)
+
+ layer = conv_layer(*args, **kwargs, **cfg_)
+
+ return layer
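A small sketch of `build_conv_layer` (assuming `mmcv` is installed): passing `cfg=None` falls back to a plain `Conv2d`, while an explicit `type` selects any conv registered in `CONV_LAYERS`, with the remaining cfg keys forwarded as keyword arguments.

```python
import torch
from mmcv.cnn import build_conv_layer  # assumes mmcv is installed

conv = build_conv_layer(None, 3, 16, kernel_size=3, padding=1)  # plain nn.Conv2d
conv_ws = build_conv_layer(dict(type='ConvWS', eps=1e-6),       # weight-standardized conv
                           3, 16, kernel_size=3, padding=1)

x = torch.randn(1, 3, 8, 8)
print(conv(x).shape, conv_ws(x).shape)  # both torch.Size([1, 16, 8, 8])
```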
diff --git a/mmcv/mmcv/cnn/bricks/conv2d_adaptive_padding.py b/mmcv/mmcv/cnn/bricks/conv2d_adaptive_padding.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a7a1d2844db097c21e5ecc55a579e0b9b95c816
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/conv2d_adaptive_padding.py
@@ -0,0 +1,64 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+from typing import Tuple, Union
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from .registry import CONV_LAYERS
+
+
+@CONV_LAYERS.register_module()
+class Conv2dAdaptivePadding(nn.Conv2d):
+ """Implementation of 2D convolution in tensorflow with `padding` as "same",
+ which applies padding to input (if needed) so that input image gets fully
+ covered by filter and stride you specified. For stride 1, this will ensure
+ that output image size is same as input. For stride of 2, output dimensions
+ will be half, for example.
+
+ Args:
+ in_channels (int): Number of channels in the input image
+ out_channels (int): Number of channels produced by the convolution
+ kernel_size (int or tuple): Size of the convolving kernel
+ stride (int or tuple, optional): Stride of the convolution. Default: 1
+ padding (int or tuple, optional): Zero-padding added to both sides of
+ the input. Default: 0
+ dilation (int or tuple, optional): Spacing between kernel elements.
+ Default: 1
+ groups (int, optional): Number of blocked connections from input
+ channels to output channels. Default: 1
+ bias (bool, optional): If ``True``, adds a learnable bias to the
+ output. Default: ``True``
+ """
+
+ def __init__(self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Tuple[int, int]],
+ stride: Union[int, Tuple[int, int]] = 1,
+ padding: Union[int, Tuple[int, int]] = 0,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ groups: int = 1,
+ bias: bool = True):
+ super().__init__(in_channels, out_channels, kernel_size, stride, 0,
+ dilation, groups, bias)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ img_h, img_w = x.size()[-2:]
+ kernel_h, kernel_w = self.weight.size()[-2:]
+ stride_h, stride_w = self.stride
+ output_h = math.ceil(img_h / stride_h)
+ output_w = math.ceil(img_w / stride_w)
+ pad_h = (
+ max((output_h - 1) * self.stride[0] +
+ (kernel_h - 1) * self.dilation[0] + 1 - img_h, 0))
+ pad_w = (
+ max((output_w - 1) * self.stride[1] +
+ (kernel_w - 1) * self.dilation[1] + 1 - img_w, 0))
+ if pad_h > 0 or pad_w > 0:
+ x = F.pad(x, [
+ pad_w // 2, pad_w - pad_w // 2, pad_h // 2, pad_h - pad_h // 2
+ ])
+ return F.conv2d(x, self.weight, self.bias, self.stride, self.padding,
+ self.dilation, self.groups)
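A quick sketch of the "same"-padding behaviour described above (assuming `mmcv` is installed): for an odd input size and stride 2, the layer pads just enough so that the output size equals `ceil(input / stride)`.

```python
import torch
from mmcv.cnn.bricks import Conv2dAdaptivePadding  # assumes mmcv is installed

conv = Conv2dAdaptivePadding(3, 8, kernel_size=3, stride=2)
x = torch.randn(1, 3, 15, 15)
print(conv(x).shape)  # torch.Size([1, 8, 8, 8]) -- ceil(15 / 2) == 8
```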
diff --git a/mmcv/mmcv/cnn/bricks/conv_module.py b/mmcv/mmcv/cnn/bricks/conv_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5d4a8c2760ea81656d3eefdad86e8dd43488447
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/conv_module.py
@@ -0,0 +1,212 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from mmcv.utils import _BatchNorm, _InstanceNorm
+from ..utils import constant_init, kaiming_init
+from .activation import build_activation_layer
+from .conv import build_conv_layer
+from .norm import build_norm_layer
+from .padding import build_padding_layer
+from .registry import PLUGIN_LAYERS
+
+
+@PLUGIN_LAYERS.register_module()
+class ConvModule(nn.Module):
+ """A conv block that bundles conv/norm/activation layers.
+
+ This block simplifies the usage of convolution layers, which are commonly
+ used with a norm layer (e.g., BatchNorm) and activation layer (e.g., ReLU).
+ It is based upon three build methods: `build_conv_layer()`,
+ `build_norm_layer()` and `build_activation_layer()`.
+
+ Besides, this module adds some extra features:
+ 1. Automatically sets the `bias` of the conv layer.
+ 2. Supports spectral norm.
+ 3. Supports more padding modes. Before PyTorch 1.5, nn.Conv2d only
+ supported zero and circular padding; this module adds a "reflect"
+ padding mode.
+
+ Args:
+ in_channels (int): Number of channels in the input feature map.
+ Same as that in ``nn._ConvNd``.
+ out_channels (int): Number of channels produced by the convolution.
+ Same as that in ``nn._ConvNd``.
+ kernel_size (int | tuple[int]): Size of the convolving kernel.
+ Same as that in ``nn._ConvNd``.
+ stride (int | tuple[int]): Stride of the convolution.
+ Same as that in ``nn._ConvNd``.
+ padding (int | tuple[int]): Zero-padding added to both sides of
+ the input. Same as that in ``nn._ConvNd``.
+ dilation (int | tuple[int]): Spacing between kernel elements.
+ Same as that in ``nn._ConvNd``.
+ groups (int): Number of blocked connections from input channels to
+ output channels. Same as that in ``nn._ConvNd``.
+ bias (bool | str): If specified as `auto`, it will be decided by the
+ norm_cfg. Bias will be set as True if `norm_cfg` is None, otherwise
+ False. Default: "auto".
+ conv_cfg (dict): Config dict for convolution layer. Default: None,
+ which means using conv2d.
+ norm_cfg (dict): Config dict for normalization layer. Default: None.
+ act_cfg (dict): Config dict for activation layer.
+ Default: dict(type='ReLU').
+ inplace (bool): Whether to use inplace mode for activation.
+ Default: True.
+ with_spectral_norm (bool): Whether use spectral norm in conv module.
+ Default: False.
+ padding_mode (str): If the `padding_mode` has not been supported by
+ current `Conv2d` in PyTorch, we will use our own padding layer
+ instead. Currently, we support ['zeros', 'circular'] with official
+ implementation and ['reflect'] with our own implementation.
+ Default: 'zeros'.
+ order (tuple[str]): The order of conv/norm/activation layers. It is a
+ sequence of "conv", "norm" and "act". Common examples are
+ ("conv", "norm", "act") and ("act", "conv", "norm").
+ Default: ('conv', 'norm', 'act').
+ """
+
+ _abbr_ = 'conv_block'
+
+ def __init__(self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Tuple[int, int]],
+ stride: Union[int, Tuple[int, int]] = 1,
+ padding: Union[int, Tuple[int, int]] = 0,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ groups: int = 1,
+ bias: Union[bool, str] = 'auto',
+ conv_cfg: Optional[Dict] = None,
+ norm_cfg: Optional[Dict] = None,
+ act_cfg: Optional[Dict] = dict(type='ReLU'),
+ inplace: bool = True,
+ with_spectral_norm: bool = False,
+ padding_mode: str = 'zeros',
+ order: tuple = ('conv', 'norm', 'act')):
+ super().__init__()
+ assert conv_cfg is None or isinstance(conv_cfg, dict)
+ assert norm_cfg is None or isinstance(norm_cfg, dict)
+ assert act_cfg is None or isinstance(act_cfg, dict)
+ official_padding_mode = ['zeros', 'circular']
+ self.conv_cfg = conv_cfg
+ self.norm_cfg = norm_cfg
+ self.act_cfg = act_cfg
+ self.inplace = inplace
+ self.with_spectral_norm = with_spectral_norm
+ self.with_explicit_padding = padding_mode not in official_padding_mode
+ self.order = order
+ assert isinstance(self.order, tuple) and len(self.order) == 3
+ assert set(order) == {'conv', 'norm', 'act'}
+
+ self.with_norm = norm_cfg is not None
+ self.with_activation = act_cfg is not None
+ # if the conv layer is before a norm layer, bias is unnecessary.
+ if bias == 'auto':
+ bias = not self.with_norm
+ self.with_bias = bias
+
+ if self.with_explicit_padding:
+ pad_cfg = dict(type=padding_mode)
+ self.padding_layer = build_padding_layer(pad_cfg, padding)
+
+ # reset padding to 0 for conv module
+ conv_padding = 0 if self.with_explicit_padding else padding
+ # build convolution layer
+ self.conv = build_conv_layer(
+ conv_cfg,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=stride,
+ padding=conv_padding,
+ dilation=dilation,
+ groups=groups,
+ bias=bias)
+ # export the attributes of self.conv to a higher level for convenience
+ self.in_channels = self.conv.in_channels
+ self.out_channels = self.conv.out_channels
+ self.kernel_size = self.conv.kernel_size
+ self.stride = self.conv.stride
+ self.padding = padding
+ self.dilation = self.conv.dilation
+ self.transposed = self.conv.transposed
+ self.output_padding = self.conv.output_padding
+ self.groups = self.conv.groups
+
+ if self.with_spectral_norm:
+ self.conv = nn.utils.spectral_norm(self.conv)
+
+ # build normalization layers
+ if self.with_norm:
+ # norm layer is after conv layer
+ if order.index('norm') > order.index('conv'):
+ norm_channels = out_channels
+ else:
+ norm_channels = in_channels
+ self.norm_name, norm = build_norm_layer(
+ norm_cfg, norm_channels) # type: ignore
+ self.add_module(self.norm_name, norm)
+ if self.with_bias:
+ if isinstance(norm, (_BatchNorm, _InstanceNorm)):
+ warnings.warn(
+ 'Unnecessary conv bias before batch/instance norm')
+ else:
+ self.norm_name = None # type: ignore
+
+ # build activation layer
+ if self.with_activation:
+ act_cfg_ = act_cfg.copy() # type: ignore
+ # nn.Tanh has no 'inplace' argument
+ if act_cfg_['type'] not in [
+ 'Tanh', 'PReLU', 'Sigmoid', 'HSigmoid', 'Swish', 'GELU'
+ ]:
+ act_cfg_.setdefault('inplace', inplace)
+ self.activate = build_activation_layer(act_cfg_)
+
+ # Use msra init by default
+ self.init_weights()
+
+ @property
+ def norm(self):
+ if self.norm_name:
+ return getattr(self, self.norm_name)
+ else:
+ return None
+
+ def init_weights(self):
+ # 1. It is mainly for customized conv layers with their own
+ # initialization manners by calling their own ``init_weights()``,
+ # and we do not want ConvModule to override the initialization.
+ # 2. For customized conv layers without their own initialization
+ # manners (that is, they don't have their own ``init_weights()``)
+ # and PyTorch's conv layers, they will be initialized by
+ # this method with default ``kaiming_init``.
+ # Note: For PyTorch's conv layers, they will be overwritten by our
+ # initialization implementation using default ``kaiming_init``.
+ if not hasattr(self.conv, 'init_weights'):
+ if self.with_activation and self.act_cfg['type'] == 'LeakyReLU':
+ nonlinearity = 'leaky_relu'
+ a = self.act_cfg.get('negative_slope', 0.01)
+ else:
+ nonlinearity = 'relu'
+ a = 0
+ kaiming_init(self.conv, a=a, nonlinearity=nonlinearity)
+ if self.with_norm:
+ constant_init(self.norm, 1, bias=0)
+
+ def forward(self,
+ x: torch.Tensor,
+ activate: bool = True,
+ norm: bool = True) -> torch.Tensor:
+ for layer in self.order:
+ if layer == 'conv':
+ if self.with_explicit_padding:
+ x = self.padding_layer(x)
+ x = self.conv(x)
+ elif layer == 'norm' and norm and self.with_norm:
+ x = self.norm(x)
+ elif layer == 'act' and activate and self.with_activation:
+ x = self.activate(x)
+ return x
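A usage sketch for `ConvModule` (assuming `mmcv` is installed): with `bias='auto'` the conv bias is dropped automatically whenever a norm layer follows, and the `order` tuple controls whether normalization and activation run before or after the convolution.

```python
import torch
from mmcv.cnn import ConvModule  # assumes mmcv is installed

block = ConvModule(
    3, 16, kernel_size=3, padding=1,
    norm_cfg=dict(type='BN'),        # conv -> BN -> ReLU (the default order)
    act_cfg=dict(type='ReLU'))

x = torch.randn(2, 3, 32, 32)
print(block(x).shape)            # torch.Size([2, 16, 32, 32])
print(block.conv.bias is None)   # True -- bias='auto' disables it before BN
```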
diff --git a/mmcv/mmcv/cnn/bricks/conv_ws.py b/mmcv/mmcv/cnn/bricks/conv_ws.py
new file mode 100644
index 0000000000000000000000000000000000000000..6569f920fea942a9345ff509c7dbdb6ace1f3741
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/conv_ws.py
@@ -0,0 +1,154 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections import OrderedDict
+from typing import Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from .registry import CONV_LAYERS
+
+
+def conv_ws_2d(input: torch.Tensor,
+ weight: torch.Tensor,
+ bias: Optional[torch.Tensor] = None,
+ stride: Union[int, Tuple[int, int]] = 1,
+ padding: Union[int, Tuple[int, int]] = 0,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ groups: int = 1,
+ eps: float = 1e-5) -> torch.Tensor:
+ c_in = weight.size(0)
+ weight_flat = weight.view(c_in, -1)
+ mean = weight_flat.mean(dim=1, keepdim=True).view(c_in, 1, 1, 1)
+ std = weight_flat.std(dim=1, keepdim=True).view(c_in, 1, 1, 1)
+ weight = (weight - mean) / (std + eps)
+ return F.conv2d(input, weight, bias, stride, padding, dilation, groups)
+
+
+@CONV_LAYERS.register_module('ConvWS')
+class ConvWS2d(nn.Conv2d):
+
+ def __init__(self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Tuple[int, int]],
+ stride: Union[int, Tuple[int, int]] = 1,
+ padding: Union[int, Tuple[int, int]] = 0,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ groups: int = 1,
+ bias: bool = True,
+ eps: float = 1e-5):
+ super().__init__(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=groups,
+ bias=bias)
+ self.eps = eps
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return conv_ws_2d(x, self.weight, self.bias, self.stride, self.padding,
+ self.dilation, self.groups, self.eps)
+
+
+@CONV_LAYERS.register_module(name='ConvAWS')
+class ConvAWS2d(nn.Conv2d):
+ """AWS (Adaptive Weight Standardization)
+
+ This is a variant of Weight Standardization
+ (https://arxiv.org/pdf/1903.10520.pdf)
+ It is used in DetectoRS to avoid NaN
+ (https://arxiv.org/pdf/2006.02334.pdf)
+
+ Args:
+ in_channels (int): Number of channels in the input image
+ out_channels (int): Number of channels produced by the convolution
+ kernel_size (int or tuple): Size of the conv kernel
+ stride (int or tuple, optional): Stride of the convolution. Default: 1
+ padding (int or tuple, optional): Zero-padding added to both sides of
+ the input. Default: 0
+ dilation (int or tuple, optional): Spacing between kernel elements.
+ Default: 1
+ groups (int, optional): Number of blocked connections from input
+ channels to output channels. Default: 1
+ bias (bool, optional): If set True, adds a learnable bias to the
+ output. Default: True
+ """
+
+ def __init__(self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Tuple[int, int]],
+ stride: Union[int, Tuple[int, int]] = 1,
+ padding: Union[int, Tuple[int, int]] = 0,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ groups: int = 1,
+ bias: bool = True):
+ super().__init__(
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=groups,
+ bias=bias)
+ self.register_buffer('weight_gamma',
+ torch.ones(self.out_channels, 1, 1, 1))
+ self.register_buffer('weight_beta',
+ torch.zeros(self.out_channels, 1, 1, 1))
+
+ def _get_weight(self, weight: torch.Tensor) -> torch.Tensor:
+ weight_flat = weight.view(weight.size(0), -1)
+ mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
+ std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
+ weight = (weight - mean) / std
+ weight = self.weight_gamma * weight + self.weight_beta
+ return weight
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ weight = self._get_weight(self.weight)
+ return F.conv2d(x, weight, self.bias, self.stride, self.padding,
+ self.dilation, self.groups)
+
+ def _load_from_state_dict(self, state_dict: OrderedDict, prefix: str,
+ local_metadata: Dict, strict: bool,
+ missing_keys: List[str],
+ unexpected_keys: List[str],
+ error_msgs: List[str]) -> None:
+ """Override default load function.
+
+ AWS overrides the function _load_from_state_dict to recover
+ weight_gamma and weight_beta if they are missing. If weight_gamma and
+ weight_beta are found in the checkpoint, this function will return
+ after super()._load_from_state_dict. Otherwise, it will compute the
+ mean and std of the pretrained weights and store them in weight_beta
+ and weight_gamma.
+ """
+
+ self.weight_gamma.data.fill_(-1)
+ local_missing_keys: List = []
+ super()._load_from_state_dict(state_dict, prefix, local_metadata,
+ strict, local_missing_keys,
+ unexpected_keys, error_msgs)
+ if self.weight_gamma.data.mean() > 0:
+ for k in local_missing_keys:
+ missing_keys.append(k)
+ return
+ weight = self.weight.data
+ weight_flat = weight.view(weight.size(0), -1)
+ mean = weight_flat.mean(dim=1).view(-1, 1, 1, 1)
+ std = torch.sqrt(weight_flat.var(dim=1) + 1e-5).view(-1, 1, 1, 1)
+ self.weight_beta.data.copy_(mean)
+ self.weight_gamma.data.copy_(std)
+ missing_gamma_beta = [
+ k for k in local_missing_keys
+ if k.endswith('weight_gamma') or k.endswith('weight_beta')
+ ]
+ for k in missing_gamma_beta:
+ local_missing_keys.remove(k)
+ for k in local_missing_keys:
+ missing_keys.append(k)
diff --git a/mmcv/mmcv/cnn/bricks/depthwise_separable_conv_module.py b/mmcv/mmcv/cnn/bricks/depthwise_separable_conv_module.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf1fe4cad3812007573211fa2bede28b23822122
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/depthwise_separable_conv_module.py
@@ -0,0 +1,99 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+
+from .conv_module import ConvModule
+
+
+class DepthwiseSeparableConvModule(nn.Module):
+ """Depthwise separable convolution module.
+
+ See https://arxiv.org/pdf/1704.04861.pdf for details.
+
+ This module can replace a ConvModule, with its single conv block replaced
+ by two conv blocks: a depthwise conv block and a pointwise conv block.
+ The depthwise conv block contains depthwise-conv/norm/activation layers,
+ and the pointwise conv block contains pointwise-conv/norm/activation
+ layers. Note that the depthwise conv block will contain norm/activation
+ layers if `norm_cfg` and `act_cfg` are specified.
+
+ Args:
+ in_channels (int): Number of channels in the input feature map.
+ Same as that in ``nn._ConvNd``.
+ out_channels (int): Number of channels produced by the convolution.
+ Same as that in ``nn._ConvNd``.
+ kernel_size (int | tuple[int]): Size of the convolving kernel.
+ Same as that in ``nn._ConvNd``.
+ stride (int | tuple[int]): Stride of the convolution.
+ Same as that in ``nn._ConvNd``. Default: 1.
+ padding (int | tuple[int]): Zero-padding added to both sides of
+ the input. Same as that in ``nn._ConvNd``. Default: 0.
+ dilation (int | tuple[int]): Spacing between kernel elements.
+ Same as that in ``nn._ConvNd``. Default: 1.
+ norm_cfg (dict): Default norm config for both depthwise ConvModule and
+ pointwise ConvModule. Default: None.
+ act_cfg (dict): Default activation config for both depthwise ConvModule
+ and pointwise ConvModule. Default: dict(type='ReLU').
+ dw_norm_cfg (dict): Norm config of depthwise ConvModule. If it is
+ 'default', it will be the same as `norm_cfg`. Default: 'default'.
+ dw_act_cfg (dict): Activation config of depthwise ConvModule. If it is
+ 'default', it will be the same as `act_cfg`. Default: 'default'.
+ pw_norm_cfg (dict): Norm config of pointwise ConvModule. If it is
+ 'default', it will be the same as `norm_cfg`. Default: 'default'.
+ pw_act_cfg (dict): Activation config of pointwise ConvModule. If it is
+ 'default', it will be the same as `act_cfg`. Default: 'default'.
+ kwargs (optional): Other shared arguments for depthwise and pointwise
+ ConvModule. See ConvModule for ref.
+ """
+
+ def __init__(self,
+ in_channels: int,
+ out_channels: int,
+ kernel_size: Union[int, Tuple[int, int]],
+ stride: Union[int, Tuple[int, int]] = 1,
+ padding: Union[int, Tuple[int, int]] = 0,
+ dilation: Union[int, Tuple[int, int]] = 1,
+ norm_cfg: Optional[Dict] = None,
+ act_cfg: Dict = dict(type='ReLU'),
+ dw_norm_cfg: Union[Dict, str] = 'default',
+ dw_act_cfg: Union[Dict, str] = 'default',
+ pw_norm_cfg: Union[Dict, str] = 'default',
+ pw_act_cfg: Union[Dict, str] = 'default',
+ **kwargs):
+ super().__init__()
+ assert 'groups' not in kwargs, 'groups should not be specified'
+
+ # if norm/activation config of depthwise/pointwise ConvModule is not
+ # specified, use default config.
+ dw_norm_cfg = dw_norm_cfg if dw_norm_cfg != 'default' else norm_cfg # type: ignore # noqa E501
+ dw_act_cfg = dw_act_cfg if dw_act_cfg != 'default' else act_cfg
+ pw_norm_cfg = pw_norm_cfg if pw_norm_cfg != 'default' else norm_cfg # type: ignore # noqa E501
+ pw_act_cfg = pw_act_cfg if pw_act_cfg != 'default' else act_cfg
+
+ # depthwise convolution
+ self.depthwise_conv = ConvModule(
+ in_channels,
+ in_channels,
+ kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ groups=in_channels,
+ norm_cfg=dw_norm_cfg, # type: ignore
+ act_cfg=dw_act_cfg, # type: ignore
+ **kwargs)
+
+ self.pointwise_conv = ConvModule(
+ in_channels,
+ out_channels,
+ 1,
+ norm_cfg=pw_norm_cfg, # type: ignore
+ act_cfg=pw_act_cfg, # type: ignore
+ **kwargs)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.depthwise_conv(x)
+ x = self.pointwise_conv(x)
+ return x
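A minimal sketch of the module above (assuming `mmcv` is installed): a 3x3 depthwise conv (one filter per input channel) followed by a 1x1 pointwise conv that mixes channels, which needs far fewer conv weights than a single dense 3x3 convolution.

```python
import torch
from mmcv.cnn import DepthwiseSeparableConvModule  # assumes mmcv is installed

dsconv = DepthwiseSeparableConvModule(
    32, 64, kernel_size=3, padding=1, norm_cfg=dict(type='BN'))

x = torch.randn(1, 32, 28, 28)
print(dsconv(x).shape)  # torch.Size([1, 64, 28, 28])
# rough weight count: 32*3*3 depthwise + 32*64 pointwise,
# versus 32*64*3*3 for a dense 3x3 conv (norm parameters not counted)
```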
diff --git a/mmcv/mmcv/cnn/bricks/drop.py b/mmcv/mmcv/cnn/bricks/drop.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea05221d854592a5d885efbef002cb673c65f778
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/drop.py
@@ -0,0 +1,69 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Any, Dict, Optional
+
+import torch
+import torch.nn as nn
+
+from mmcv import build_from_cfg
+from .registry import DROPOUT_LAYERS
+
+
+def drop_path(x: torch.Tensor,
+ drop_prob: float = 0.,
+ training: bool = False) -> torch.Tensor:
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of
+ residual blocks).
+
+ We follow the implementation
+ https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
+ """
+ if drop_prob == 0. or not training:
+ return x
+ keep_prob = 1 - drop_prob
+ # handle tensors with different dimensions, not just 4D tensors.
+ shape = (x.shape[0], ) + (1, ) * (x.ndim - 1)
+ random_tensor = keep_prob + torch.rand(
+ shape, dtype=x.dtype, device=x.device)
+ output = x.div(keep_prob) * random_tensor.floor()
+ return output
+
+
+@DROPOUT_LAYERS.register_module()
+class DropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of
+ residual blocks).
+
+ We follow the implementation
+ https://github.com/rwightman/pytorch-image-models/blob/a2727c1bf78ba0d7b5727f5f95e37fb7f8866b1f/timm/models/layers/drop.py # noqa: E501
+
+ Args:
+ drop_prob (float): Probability of the path to be zeroed. Default: 0.1
+ """
+
+ def __init__(self, drop_prob: float = 0.1):
+ super().__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return drop_path(x, self.drop_prob, self.training)
+
+
+@DROPOUT_LAYERS.register_module()
+class Dropout(nn.Dropout):
+ """A wrapper for ``torch.nn.Dropout``, We rename the ``p`` of
+ ``torch.nn.Dropout`` to ``drop_prob`` so as to be consistent with
+ ``DropPath``
+
+ Args:
+ drop_prob (float): Probability of the elements to be
+ zeroed. Default: 0.5.
+ inplace (bool): Do the operation inplace or not. Default: False.
+ """
+
+ def __init__(self, drop_prob: float = 0.5, inplace: bool = False):
+ super().__init__(p=drop_prob, inplace=inplace)
+
+
+def build_dropout(cfg: Dict, default_args: Optional[Dict] = None) -> Any:
+ """Builder for drop out layers."""
+ return build_from_cfg(cfg, DROPOUT_LAYERS, default_args)
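A short sketch of the stochastic-depth behaviour above (assuming `mmcv` is installed): in training mode, whole samples in the batch are zeroed with probability `drop_prob` and the survivors are rescaled by `1 / (1 - drop_prob)`; in eval mode the layer is an identity.

```python
import torch
from mmcv.cnn.bricks import DropPath  # assumes mmcv is installed

dp = DropPath(drop_prob=0.2)
x = torch.ones(8, 4)

dp.train()
y = dp(x)
print(y[:, 0])                # each row is either all-zero or scaled to 1 / 0.8 = 1.25

dp.eval()
print(torch.equal(dp(x), x))  # True -- identity at inference time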
diff --git a/mmcv/mmcv/cnn/bricks/generalized_attention.py b/mmcv/mmcv/cnn/bricks/generalized_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..118e39c7ea2d9f24a97f22878dfbe753c4afef0b
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/generalized_attention.py
@@ -0,0 +1,412 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..utils import kaiming_init
+from .registry import PLUGIN_LAYERS
+
+
+@PLUGIN_LAYERS.register_module()
+class GeneralizedAttention(nn.Module):
+ """GeneralizedAttention module.
+
+ See 'An Empirical Study of Spatial Attention Mechanisms in Deep Networks'
+ (https://arxiv.org/abs/1904.05873) for details.
+
+ Args:
+ in_channels (int): Channels of the input feature map.
+ spatial_range (int): The spatial range. -1 indicates no spatial range
+ constraint. Default: -1.
+ num_heads (int): The head number of empirical_attention module.
+ Default: 9.
+ position_embedding_dim (int): The position embedding dimension.
+ Default: -1.
+ position_magnitude (int): A multiplier acting on coord difference.
+ Default: 1.
+ kv_stride (int): The feature stride acting on key/value feature map.
+ Default: 2.
+ q_stride (int): The feature stride acting on query feature map.
+ Default: 1.
+ attention_type (str): A binary indicator string for indicating which
+ items in generalized empirical_attention module are used.
+ Default: '1111'.
+
+ - '1000' indicates 'query and key content' (appr - appr) item,
+ - '0100' indicates 'query content and relative position'
+ (appr - position) item,
+ - '0010' indicates 'key content only' (bias - appr) item,
+ - '0001' indicates 'relative position only' (bias - position) item.
+ """
+
+ _abbr_ = 'gen_attention_block'
+
+ def __init__(self,
+ in_channels: int,
+ spatial_range: int = -1,
+ num_heads: int = 9,
+ position_embedding_dim: int = -1,
+ position_magnitude: int = 1,
+ kv_stride: int = 2,
+ q_stride: int = 1,
+ attention_type: str = '1111'):
+
+ super().__init__()
+
+ # hard range means local range for non-local operation
+ self.position_embedding_dim = (
+ position_embedding_dim
+ if position_embedding_dim > 0 else in_channels)
+
+ self.position_magnitude = position_magnitude
+ self.num_heads = num_heads
+ self.in_channels = in_channels
+ self.spatial_range = spatial_range
+ self.kv_stride = kv_stride
+ self.q_stride = q_stride
+ self.attention_type = [bool(int(_)) for _ in attention_type]
+ self.qk_embed_dim = in_channels // num_heads
+ out_c = self.qk_embed_dim * num_heads
+
+ if self.attention_type[0] or self.attention_type[1]:
+ self.query_conv = nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=out_c,
+ kernel_size=1,
+ bias=False)
+ self.query_conv.kaiming_init = True
+
+ if self.attention_type[0] or self.attention_type[2]:
+ self.key_conv = nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=out_c,
+ kernel_size=1,
+ bias=False)
+ self.key_conv.kaiming_init = True
+
+ self.v_dim = in_channels // num_heads
+ self.value_conv = nn.Conv2d(
+ in_channels=in_channels,
+ out_channels=self.v_dim * num_heads,
+ kernel_size=1,
+ bias=False)
+ self.value_conv.kaiming_init = True
+
+ if self.attention_type[1] or self.attention_type[3]:
+ self.appr_geom_fc_x = nn.Linear(
+ self.position_embedding_dim // 2, out_c, bias=False)
+ self.appr_geom_fc_x.kaiming_init = True
+
+ self.appr_geom_fc_y = nn.Linear(
+ self.position_embedding_dim // 2, out_c, bias=False)
+ self.appr_geom_fc_y.kaiming_init = True
+
+ if self.attention_type[2]:
+ stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
+ appr_bias_value = -2 * stdv * torch.rand(out_c) + stdv
+ self.appr_bias = nn.Parameter(appr_bias_value)
+
+ if self.attention_type[3]:
+ stdv = 1.0 / math.sqrt(self.qk_embed_dim * 2)
+ geom_bias_value = -2 * stdv * torch.rand(out_c) + stdv
+ self.geom_bias = nn.Parameter(geom_bias_value)
+
+ self.proj_conv = nn.Conv2d(
+ in_channels=self.v_dim * num_heads,
+ out_channels=in_channels,
+ kernel_size=1,
+ bias=True)
+ self.proj_conv.kaiming_init = True
+ self.gamma = nn.Parameter(torch.zeros(1))
+
+ if self.spatial_range >= 0:
+ # only works when non local is after 3*3 conv
+ if in_channels == 256:
+ max_len = 84
+ elif in_channels == 512:
+ max_len = 42
+
+ max_len_kv = int((max_len - 1.0) / self.kv_stride + 1)
+ local_constraint_map = np.ones(
+ (max_len, max_len, max_len_kv, max_len_kv), dtype=int)
+ for iy in range(max_len):
+ for ix in range(max_len):
+ local_constraint_map[
+ iy, ix,
+ max((iy - self.spatial_range) //
+ self.kv_stride, 0):min((iy + self.spatial_range +
+ 1) // self.kv_stride +
+ 1, max_len),
+ max((ix - self.spatial_range) //
+ self.kv_stride, 0):min((ix + self.spatial_range +
+ 1) // self.kv_stride +
+ 1, max_len)] = 0
+
+ self.local_constraint_map = nn.Parameter(
+ torch.from_numpy(local_constraint_map).byte(),
+ requires_grad=False)
+
+ if self.q_stride > 1:
+ self.q_downsample = nn.AvgPool2d(
+ kernel_size=1, stride=self.q_stride)
+ else:
+ self.q_downsample = None
+
+ if self.kv_stride > 1:
+ self.kv_downsample = nn.AvgPool2d(
+ kernel_size=1, stride=self.kv_stride)
+ else:
+ self.kv_downsample = None
+
+ self.init_weights()
+
+ def get_position_embedding(self,
+ h,
+ w,
+ h_kv,
+ w_kv,
+ q_stride,
+ kv_stride,
+ device,
+ dtype,
+ feat_dim,
+ wave_length=1000):
+ # the default type of Tensor is float32, leading to type mismatch
+ # in fp16 mode. Cast it to support fp16 mode.
+ h_idxs = torch.linspace(0, h - 1, h).to(device=device, dtype=dtype)
+ h_idxs = h_idxs.view((h, 1)) * q_stride
+
+ w_idxs = torch.linspace(0, w - 1, w).to(device=device, dtype=dtype)
+ w_idxs = w_idxs.view((w, 1)) * q_stride
+
+ h_kv_idxs = torch.linspace(0, h_kv - 1, h_kv).to(
+ device=device, dtype=dtype)
+ h_kv_idxs = h_kv_idxs.view((h_kv, 1)) * kv_stride
+
+ w_kv_idxs = torch.linspace(0, w_kv - 1, w_kv).to(
+ device=device, dtype=dtype)
+ w_kv_idxs = w_kv_idxs.view((w_kv, 1)) * kv_stride
+
+ # (h, h_kv, 1)
+ h_diff = h_idxs.unsqueeze(1) - h_kv_idxs.unsqueeze(0)
+ h_diff *= self.position_magnitude
+
+ # (w, w_kv, 1)
+ w_diff = w_idxs.unsqueeze(1) - w_kv_idxs.unsqueeze(0)
+ w_diff *= self.position_magnitude
+
+ feat_range = torch.arange(0, feat_dim / 4).to(
+ device=device, dtype=dtype)
+
+ dim_mat = torch.Tensor([wave_length]).to(device=device, dtype=dtype)
+ dim_mat = dim_mat**((4. / feat_dim) * feat_range)
+ dim_mat = dim_mat.view((1, 1, -1))
+
+ embedding_x = torch.cat(
+ ((w_diff / dim_mat).sin(), (w_diff / dim_mat).cos()), dim=2)
+
+ embedding_y = torch.cat(
+ ((h_diff / dim_mat).sin(), (h_diff / dim_mat).cos()), dim=2)
+
+ return embedding_x, embedding_y
+
+ def forward(self, x_input: torch.Tensor) -> torch.Tensor:
+ num_heads = self.num_heads
+
+ # use empirical_attention
+ if self.q_downsample is not None:
+ x_q = self.q_downsample(x_input)
+ else:
+ x_q = x_input
+ n, _, h, w = x_q.shape
+
+ if self.kv_downsample is not None:
+ x_kv = self.kv_downsample(x_input)
+ else:
+ x_kv = x_input
+ _, _, h_kv, w_kv = x_kv.shape
+
+ if self.attention_type[0] or self.attention_type[1]:
+ proj_query = self.query_conv(x_q).view(
+ (n, num_heads, self.qk_embed_dim, h * w))
+ proj_query = proj_query.permute(0, 1, 3, 2)
+
+ if self.attention_type[0] or self.attention_type[2]:
+ proj_key = self.key_conv(x_kv).view(
+ (n, num_heads, self.qk_embed_dim, h_kv * w_kv))
+
+ if self.attention_type[1] or self.attention_type[3]:
+ position_embed_x, position_embed_y = self.get_position_embedding(
+ h, w, h_kv, w_kv, self.q_stride, self.kv_stride,
+ x_input.device, x_input.dtype, self.position_embedding_dim)
+ # (n, num_heads, w, w_kv, dim)
+ position_feat_x = self.appr_geom_fc_x(position_embed_x).\
+ view(1, w, w_kv, num_heads, self.qk_embed_dim).\
+ permute(0, 3, 1, 2, 4).\
+ repeat(n, 1, 1, 1, 1)
+
+ # (n, num_heads, h, h_kv, dim)
+ position_feat_y = self.appr_geom_fc_y(position_embed_y).\
+ view(1, h, h_kv, num_heads, self.qk_embed_dim).\
+ permute(0, 3, 1, 2, 4).\
+ repeat(n, 1, 1, 1, 1)
+
+ position_feat_x /= math.sqrt(2)
+ position_feat_y /= math.sqrt(2)
+
+ # accelerate for saliency only
+ if (np.sum(self.attention_type) == 1) and self.attention_type[2]:
+ appr_bias = self.appr_bias.\
+ view(1, num_heads, 1, self.qk_embed_dim).\
+ repeat(n, 1, 1, 1)
+
+ energy = torch.matmul(appr_bias, proj_key).\
+ view(n, num_heads, 1, h_kv * w_kv)
+
+ h = 1
+ w = 1
+ else:
+ # (n, num_heads, h*w, h_kv*w_kv), query before key, 540mb for
+ if not self.attention_type[0]:
+ energy = torch.zeros(
+ n,
+ num_heads,
+ h,
+ w,
+ h_kv,
+ w_kv,
+ dtype=x_input.dtype,
+ device=x_input.device)
+
+ # attention_type[0]: appr - appr
+ # attention_type[1]: appr - position
+ # attention_type[2]: bias - appr
+ # attention_type[3]: bias - position
+ if self.attention_type[0] or self.attention_type[2]:
+ if self.attention_type[0] and self.attention_type[2]:
+ appr_bias = self.appr_bias.\
+ view(1, num_heads, 1, self.qk_embed_dim)
+ energy = torch.matmul(proj_query + appr_bias, proj_key).\
+ view(n, num_heads, h, w, h_kv, w_kv)
+
+ elif self.attention_type[0]:
+ energy = torch.matmul(proj_query, proj_key).\
+ view(n, num_heads, h, w, h_kv, w_kv)
+
+ elif self.attention_type[2]:
+ appr_bias = self.appr_bias.\
+ view(1, num_heads, 1, self.qk_embed_dim).\
+ repeat(n, 1, 1, 1)
+
+ energy += torch.matmul(appr_bias, proj_key).\
+ view(n, num_heads, 1, 1, h_kv, w_kv)
+
+ if self.attention_type[1] or self.attention_type[3]:
+ if self.attention_type[1] and self.attention_type[3]:
+ geom_bias = self.geom_bias.\
+ view(1, num_heads, 1, self.qk_embed_dim)
+
+ proj_query_reshape = (proj_query + geom_bias).\
+ view(n, num_heads, h, w, self.qk_embed_dim)
+
+ energy_x = torch.matmul(
+ proj_query_reshape.permute(0, 1, 3, 2, 4),
+ position_feat_x.permute(0, 1, 2, 4, 3))
+ energy_x = energy_x.\
+ permute(0, 1, 3, 2, 4).unsqueeze(4)
+
+ energy_y = torch.matmul(
+ proj_query_reshape,
+ position_feat_y.permute(0, 1, 2, 4, 3))
+ energy_y = energy_y.unsqueeze(5)
+
+ energy += energy_x + energy_y
+
+ elif self.attention_type[1]:
+ proj_query_reshape = proj_query.\
+ view(n, num_heads, h, w, self.qk_embed_dim)
+ proj_query_reshape = proj_query_reshape.\
+ permute(0, 1, 3, 2, 4)
+ position_feat_x_reshape = position_feat_x.\
+ permute(0, 1, 2, 4, 3)
+ position_feat_y_reshape = position_feat_y.\
+ permute(0, 1, 2, 4, 3)
+
+ energy_x = torch.matmul(proj_query_reshape,
+ position_feat_x_reshape)
+ energy_x = energy_x.permute(0, 1, 3, 2, 4).unsqueeze(4)
+
+ energy_y = torch.matmul(proj_query_reshape,
+ position_feat_y_reshape)
+ energy_y = energy_y.unsqueeze(5)
+
+ energy += energy_x + energy_y
+
+ elif self.attention_type[3]:
+ geom_bias = self.geom_bias.\
+ view(1, num_heads, self.qk_embed_dim, 1).\
+ repeat(n, 1, 1, 1)
+
+ position_feat_x_reshape = position_feat_x.\
+ view(n, num_heads, w * w_kv, self.qk_embed_dim)
+
+ position_feat_y_reshape = position_feat_y.\
+ view(n, num_heads, h * h_kv, self.qk_embed_dim)
+
+ energy_x = torch.matmul(position_feat_x_reshape, geom_bias)
+ energy_x = energy_x.view(n, num_heads, 1, w, 1, w_kv)
+
+ energy_y = torch.matmul(position_feat_y_reshape, geom_bias)
+ energy_y = energy_y.view(n, num_heads, h, 1, h_kv, 1)
+
+ energy += energy_x + energy_y
+
+ energy = energy.view(n, num_heads, h * w, h_kv * w_kv)
+
+ if self.spatial_range >= 0:
+ cur_local_constraint_map = \
+ self.local_constraint_map[:h, :w, :h_kv, :w_kv].\
+ contiguous().\
+ view(1, 1, h*w, h_kv*w_kv)
+
+ energy = energy.masked_fill_(cur_local_constraint_map,
+ float('-inf'))
+
+ attention = F.softmax(energy, 3)
+
+ proj_value = self.value_conv(x_kv)
+ proj_value_reshape = proj_value.\
+ view((n, num_heads, self.v_dim, h_kv * w_kv)).\
+ permute(0, 1, 3, 2)
+
+ out = torch.matmul(attention, proj_value_reshape).\
+ permute(0, 1, 3, 2).\
+ contiguous().\
+ view(n, self.v_dim * self.num_heads, h, w)
+
+ out = self.proj_conv(out)
+
+ # output is downsampled, upsample back to input size
+ if self.q_downsample is not None:
+ out = F.interpolate(
+ out,
+ size=x_input.shape[2:],
+ mode='bilinear',
+ align_corners=False)
+
+ out = self.gamma * out + x_input
+ return out
+
+ def init_weights(self):
+ for m in self.modules():
+ if hasattr(m, 'kaiming_init') and m.kaiming_init:
+ kaiming_init(
+ m,
+ mode='fan_in',
+ nonlinearity='leaky_relu',
+ bias=0,
+ distribution='uniform',
+ a=1)
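A shape-level sketch of the module above (assuming `mmcv` is installed). `in_channels` must be divisible by `num_heads`, and since `gamma` is zero-initialised the block starts as an identity mapping, matching its residual design.

```python
import torch
from mmcv.cnn import GeneralizedAttention  # assumes mmcv is installed

attn = GeneralizedAttention(in_channels=16, num_heads=8, attention_type='1111')
x = torch.randn(1, 16, 20, 20)
out = attn(x)

print(out.shape)               # torch.Size([1, 16, 20, 20]) -- residual output
print(torch.allclose(out, x))  # True at initialisation: gamma starts at zero
```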
diff --git a/mmcv/mmcv/cnn/bricks/hsigmoid.py b/mmcv/mmcv/cnn/bricks/hsigmoid.py
new file mode 100644
index 0000000000000000000000000000000000000000..5eb97e8ab13e76c6916a7ebba15cb50f8b846897
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/hsigmoid.py
@@ -0,0 +1,51 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+import torch.nn as nn
+
+from .registry import ACTIVATION_LAYERS
+
+
+@ACTIVATION_LAYERS.register_module()
+class HSigmoid(nn.Module):
+ """Hard Sigmoid Module. Apply the hard sigmoid function:
+ Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value)
+ Default: Hsigmoid(x) = min(max((x + 3) / 6, 0), 1)
+
+ Note:
+ In MMCV v1.4.4, we modified the default value of args to align with
+ PyTorch official.
+
+ Args:
+ bias (float): Bias of the input feature map. Default: 3.0.
+ divisor (float): Divisor of the input feature map. Default: 6.0.
+ min_value (float): Lower bound value. Default: 0.0.
+ max_value (float): Upper bound value. Default: 1.0.
+
+ Returns:
+ Tensor: The output tensor.
+ """
+
+ def __init__(self,
+ bias: float = 3.0,
+ divisor: float = 6.0,
+ min_value: float = 0.0,
+ max_value: float = 1.0):
+ super().__init__()
+ warnings.warn(
+ 'In MMCV v1.4.4, we modified the default value of args to align '
+ 'with PyTorch official. Previous Implementation: '
+ 'Hsigmoid(x) = min(max((x + 1) / 2, 0), 1). '
+ 'Current Implementation: '
+ 'Hsigmoid(x) = min(max((x + 3) / 6, 0), 1).')
+ self.bias = bias
+ self.divisor = divisor
+ assert self.divisor != 0
+ self.min_value = min_value
+ self.max_value = max_value
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = (x + self.bias) / self.divisor
+
+ return x.clamp_(self.min_value, self.max_value)
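A tiny numeric check of the definition above (assuming `mmcv` is installed); note that constructing the layer also emits the warning about the changed default arguments.

```python
import torch
from mmcv.cnn import HSigmoid  # assumes mmcv is installed

hsig = HSigmoid()              # min(max((x + 3) / 6, 0), 1)
x = torch.tensor([-4.0, 0.0, 4.0])
print(hsig(x))                 # tensor([0.0000, 0.5000, 1.0000])
```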
diff --git a/mmcv/mmcv/cnn/bricks/hswish.py b/mmcv/mmcv/cnn/bricks/hswish.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f6cc276c10a5c49bd9c0e30a1ffad4a1b6018d4
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/hswish.py
@@ -0,0 +1,39 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+
+from mmcv.utils import TORCH_VERSION, digit_version
+from .registry import ACTIVATION_LAYERS
+
+
+class HSwish(nn.Module):
+ """Hard Swish Module.
+
+ This module applies the hard swish function:
+
+ .. math::
+ Hswish(x) = x * ReLU6(x + 3) / 6
+
+ Args:
+ inplace (bool): can optionally do the operation in-place.
+ Default: False.
+
+ Returns:
+ Tensor: The output tensor.
+ """
+
+ def __init__(self, inplace: bool = False):
+ super().__init__()
+ self.act = nn.ReLU6(inplace)
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return x * self.act(x + 3) / 6
+
+
+if (TORCH_VERSION == 'parrots'
+ or digit_version(TORCH_VERSION) < digit_version('1.7')):
+ # nn.Hardswish was only added in PyTorch 1.6, and the 1.6 version does
+ # not support `inplace`, so the custom HSwish is registered for PyTorch
+ # versions below 1.7.
+ ACTIVATION_LAYERS.register_module(module=HSwish)
+else:
+ ACTIVATION_LAYERS.register_module(module=nn.Hardswish, name='HSwish')
diff --git a/mmcv/mmcv/cnn/bricks/non_local.py b/mmcv/mmcv/cnn/bricks/non_local.py
new file mode 100644
index 0000000000000000000000000000000000000000..159db245e80950d9b94e2744361bca2a09e67c13
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/non_local.py
@@ -0,0 +1,308 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta
+from typing import Dict, Optional
+
+import torch
+import torch.nn as nn
+
+from ..utils import constant_init, normal_init
+from .conv_module import ConvModule
+from .registry import PLUGIN_LAYERS
+
+
+class _NonLocalNd(nn.Module, metaclass=ABCMeta):
+ """Basic Non-local module.
+
+ This module is proposed in
+ "Non-local Neural Networks"
+ Paper reference: https://arxiv.org/abs/1711.07971
+ Code reference: https://github.com/AlexHex7/Non-local_pytorch
+
+ Args:
+ in_channels (int): Channels of the input feature map.
+ reduction (int): Channel reduction ratio. Default: 2.
+ use_scale (bool): Whether to scale pairwise_weight by
+ `1/sqrt(inter_channels)` when the mode is `embedded_gaussian`.
+ Default: True.
+ conv_cfg (None | dict): The config dict for convolution layers.
+ If not specified, it will use `nn.Conv2d` for convolution layers.
+ Default: None.
+ norm_cfg (None | dict): The config dict for normalization layers.
+ Default: None. (This parameter is only applicable to conv_out.)
+ mode (str): Options are `gaussian`, `concatenation`,
+ `embedded_gaussian` and `dot_product`. Default: embedded_gaussian.
+ """
+
+ def __init__(self,
+ in_channels: int,
+ reduction: int = 2,
+ use_scale: bool = True,
+ conv_cfg: Optional[Dict] = None,
+ norm_cfg: Optional[Dict] = None,
+ mode: str = 'embedded_gaussian',
+ **kwargs):
+ super().__init__()
+ self.in_channels = in_channels
+ self.reduction = reduction
+ self.use_scale = use_scale
+ self.inter_channels = max(in_channels // reduction, 1)
+ self.mode = mode
+
+ if mode not in [
+ 'gaussian', 'embedded_gaussian', 'dot_product', 'concatenation'
+ ]:
+ raise ValueError("Mode should be in 'gaussian', 'concatenation', "
+ f"'embedded_gaussian' or 'dot_product', but got "
+ f'{mode} instead.')
+
+ # g, theta, phi are defaulted as `nn.ConvNd`.
+ # Here we use ConvModule for potential usage.
+ self.g = ConvModule(
+ self.in_channels,
+ self.inter_channels,
+ kernel_size=1,
+ conv_cfg=conv_cfg,
+ act_cfg=None) # type: ignore
+ self.conv_out = ConvModule(
+ self.inter_channels,
+ self.in_channels,
+ kernel_size=1,
+ conv_cfg=conv_cfg,
+ norm_cfg=norm_cfg,
+ act_cfg=None)
+
+ if self.mode != 'gaussian':
+ self.theta = ConvModule(
+ self.in_channels,
+ self.inter_channels,
+ kernel_size=1,
+ conv_cfg=conv_cfg,
+ act_cfg=None)
+ self.phi = ConvModule(
+ self.in_channels,
+ self.inter_channels,
+ kernel_size=1,
+ conv_cfg=conv_cfg,
+ act_cfg=None)
+
+ if self.mode == 'concatenation':
+ self.concat_project = ConvModule(
+ self.inter_channels * 2,
+ 1,
+ kernel_size=1,
+ stride=1,
+ padding=0,
+ bias=False,
+ act_cfg=dict(type='ReLU'))
+
+ self.init_weights(**kwargs)
+
+ def init_weights(self, std: float = 0.01, zeros_init: bool = True) -> None:
+ if self.mode != 'gaussian':
+ for m in [self.g, self.theta, self.phi]:
+ normal_init(m.conv, std=std)
+ else:
+ normal_init(self.g.conv, std=std)
+ if zeros_init:
+ if self.conv_out.norm_cfg is None:
+ constant_init(self.conv_out.conv, 0)
+ else:
+ constant_init(self.conv_out.norm, 0)
+ else:
+ if self.conv_out.norm_cfg is None:
+ normal_init(self.conv_out.conv, std=std)
+ else:
+ normal_init(self.conv_out.norm, std=std)
+
+ def gaussian(self, theta_x: torch.Tensor,
+ phi_x: torch.Tensor) -> torch.Tensor:
+ # NonLocal1d pairwise_weight: [N, H, H]
+ # NonLocal2d pairwise_weight: [N, HxW, HxW]
+ # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
+ pairwise_weight = torch.matmul(theta_x, phi_x)
+ pairwise_weight = pairwise_weight.softmax(dim=-1)
+ return pairwise_weight
+
+ def embedded_gaussian(self, theta_x: torch.Tensor,
+ phi_x: torch.Tensor) -> torch.Tensor:
+ # NonLocal1d pairwise_weight: [N, H, H]
+ # NonLocal2d pairwise_weight: [N, HxW, HxW]
+ # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
+ pairwise_weight = torch.matmul(theta_x, phi_x)
+ if self.use_scale:
+ # theta_x.shape[-1] is `self.inter_channels`
+ pairwise_weight /= theta_x.shape[-1]**0.5
+ pairwise_weight = pairwise_weight.softmax(dim=-1)
+ return pairwise_weight
+
+ def dot_product(self, theta_x: torch.Tensor,
+ phi_x: torch.Tensor) -> torch.Tensor:
+ # NonLocal1d pairwise_weight: [N, H, H]
+ # NonLocal2d pairwise_weight: [N, HxW, HxW]
+ # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
+ pairwise_weight = torch.matmul(theta_x, phi_x)
+ pairwise_weight /= pairwise_weight.shape[-1]
+ return pairwise_weight
+
+ def concatenation(self, theta_x: torch.Tensor,
+ phi_x: torch.Tensor) -> torch.Tensor:
+ # NonLocal1d pairwise_weight: [N, H, H]
+ # NonLocal2d pairwise_weight: [N, HxW, HxW]
+ # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
+ h = theta_x.size(2)
+ w = phi_x.size(3)
+ theta_x = theta_x.repeat(1, 1, 1, w)
+ phi_x = phi_x.repeat(1, 1, h, 1)
+
+ concat_feature = torch.cat([theta_x, phi_x], dim=1)
+ pairwise_weight = self.concat_project(concat_feature)
+ n, _, h, w = pairwise_weight.size()
+ pairwise_weight = pairwise_weight.view(n, h, w)
+ pairwise_weight /= pairwise_weight.shape[-1]
+
+ return pairwise_weight
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Assume `reduction = 1`, then `inter_channels = C`.
+        # In `gaussian` mode theta/phi act on the raw input, so the channel
+        # dimension below is the original `C` in that case as well.
+
+ # NonLocal1d x: [N, C, H]
+ # NonLocal2d x: [N, C, H, W]
+ # NonLocal3d x: [N, C, T, H, W]
+ n = x.size(0)
+
+ # NonLocal1d g_x: [N, H, C]
+ # NonLocal2d g_x: [N, HxW, C]
+ # NonLocal3d g_x: [N, TxHxW, C]
+ g_x = self.g(x).view(n, self.inter_channels, -1)
+ g_x = g_x.permute(0, 2, 1)
+
+ # NonLocal1d theta_x: [N, H, C], phi_x: [N, C, H]
+ # NonLocal2d theta_x: [N, HxW, C], phi_x: [N, C, HxW]
+ # NonLocal3d theta_x: [N, TxHxW, C], phi_x: [N, C, TxHxW]
+ if self.mode == 'gaussian':
+ theta_x = x.view(n, self.in_channels, -1)
+ theta_x = theta_x.permute(0, 2, 1)
+ if self.sub_sample:
+ phi_x = self.phi(x).view(n, self.in_channels, -1)
+ else:
+ phi_x = x.view(n, self.in_channels, -1)
+ elif self.mode == 'concatenation':
+ theta_x = self.theta(x).view(n, self.inter_channels, -1, 1)
+ phi_x = self.phi(x).view(n, self.inter_channels, 1, -1)
+ else:
+ theta_x = self.theta(x).view(n, self.inter_channels, -1)
+ theta_x = theta_x.permute(0, 2, 1)
+ phi_x = self.phi(x).view(n, self.inter_channels, -1)
+
+ pairwise_func = getattr(self, self.mode)
+ # NonLocal1d pairwise_weight: [N, H, H]
+ # NonLocal2d pairwise_weight: [N, HxW, HxW]
+ # NonLocal3d pairwise_weight: [N, TxHxW, TxHxW]
+ pairwise_weight = pairwise_func(theta_x, phi_x)
+
+ # NonLocal1d y: [N, H, C]
+ # NonLocal2d y: [N, HxW, C]
+ # NonLocal3d y: [N, TxHxW, C]
+ y = torch.matmul(pairwise_weight, g_x)
+ # NonLocal1d y: [N, C, H]
+ # NonLocal2d y: [N, C, H, W]
+ # NonLocal3d y: [N, C, T, H, W]
+ y = y.permute(0, 2, 1).contiguous().reshape(n, self.inter_channels,
+ *x.size()[2:])
+
+ output = x + self.conv_out(y)
+
+ return output
+
+
+class NonLocal1d(_NonLocalNd):
+ """1D Non-local module.
+
+ Args:
+ in_channels (int): Same as `NonLocalND`.
+ sub_sample (bool): Whether to apply max pooling after pairwise
+ function (Note that the `sub_sample` is applied on spatial only).
+ Default: False.
+ conv_cfg (None | dict): Same as `NonLocalND`.
+ Default: dict(type='Conv1d').
+ """
+
+ def __init__(self,
+ in_channels: int,
+ sub_sample: bool = False,
+ conv_cfg: Dict = dict(type='Conv1d'),
+ **kwargs):
+ super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs)
+
+ self.sub_sample = sub_sample
+
+ if sub_sample:
+ max_pool_layer = nn.MaxPool1d(kernel_size=2)
+ self.g = nn.Sequential(self.g, max_pool_layer)
+ if self.mode != 'gaussian':
+ self.phi = nn.Sequential(self.phi, max_pool_layer)
+ else:
+ self.phi = max_pool_layer
+
+
+@PLUGIN_LAYERS.register_module()
+class NonLocal2d(_NonLocalNd):
+ """2D Non-local module.
+
+ Args:
+ in_channels (int): Same as `NonLocalND`.
+ sub_sample (bool): Whether to apply max pooling after pairwise
+ function (Note that the `sub_sample` is applied on spatial only).
+ Default: False.
+ conv_cfg (None | dict): Same as `NonLocalND`.
+ Default: dict(type='Conv2d').
+ """
+
+ _abbr_ = 'nonlocal_block'
+
+ def __init__(self,
+ in_channels: int,
+ sub_sample: bool = False,
+ conv_cfg: Dict = dict(type='Conv2d'),
+ **kwargs):
+ super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs)
+
+ self.sub_sample = sub_sample
+
+ if sub_sample:
+ max_pool_layer = nn.MaxPool2d(kernel_size=(2, 2))
+ self.g = nn.Sequential(self.g, max_pool_layer)
+ if self.mode != 'gaussian':
+ self.phi = nn.Sequential(self.phi, max_pool_layer)
+ else:
+ self.phi = max_pool_layer
+
+
+class NonLocal3d(_NonLocalNd):
+ """3D Non-local module.
+
+ Args:
+ in_channels (int): Same as `NonLocalND`.
+ sub_sample (bool): Whether to apply max pooling after pairwise
+ function (Note that the `sub_sample` is applied on spatial only).
+ Default: False.
+ conv_cfg (None | dict): Same as `NonLocalND`.
+ Default: dict(type='Conv3d').
+ """
+
+ def __init__(self,
+ in_channels: int,
+ sub_sample: bool = False,
+ conv_cfg: Dict = dict(type='Conv3d'),
+ **kwargs):
+ super().__init__(in_channels, conv_cfg=conv_cfg, **kwargs)
+ self.sub_sample = sub_sample
+
+ if sub_sample:
+ max_pool_layer = nn.MaxPool3d(kernel_size=(1, 2, 2))
+ self.g = nn.Sequential(self.g, max_pool_layer)
+ if self.mode != 'gaussian':
+ self.phi = nn.Sequential(self.phi, max_pool_layer)
+ else:
+ self.phi = max_pool_layer
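
For orientation, a minimal usage sketch of the 2D variant defined above (assuming the vendored `mmcv` package directory is on the import path; the block also relies on `ConvModule` from the wider package):

```python
import torch
from mmcv.cnn.bricks.non_local import NonLocal2d

# Embedded-Gaussian non-local block; reduction=2 gives inter_channels=8
# for the internal g/theta/phi 1x1 projections.
block = NonLocal2d(in_channels=16, reduction=2, mode='embedded_gaussian')

x = torch.rand(2, 16, 8, 8)
out = block(x)  # residual output: x + conv_out(attention-weighted g(x))
assert out.shape == x.shape
```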
diff --git a/mmcv/mmcv/cnn/bricks/norm.py b/mmcv/mmcv/cnn/bricks/norm.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6281a7c697483fbdaaba5a37d88a00f3c259d31
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/norm.py
@@ -0,0 +1,148 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import inspect
+from typing import Dict, Tuple, Union
+
+import torch.nn as nn
+
+from mmcv.utils import is_tuple_of
+from mmcv.utils.parrots_wrapper import SyncBatchNorm, _BatchNorm, _InstanceNorm
+from .registry import NORM_LAYERS
+
+NORM_LAYERS.register_module('BN', module=nn.BatchNorm2d)
+NORM_LAYERS.register_module('BN1d', module=nn.BatchNorm1d)
+NORM_LAYERS.register_module('BN2d', module=nn.BatchNorm2d)
+NORM_LAYERS.register_module('BN3d', module=nn.BatchNorm3d)
+NORM_LAYERS.register_module('SyncBN', module=SyncBatchNorm)
+NORM_LAYERS.register_module('GN', module=nn.GroupNorm)
+NORM_LAYERS.register_module('LN', module=nn.LayerNorm)
+NORM_LAYERS.register_module('IN', module=nn.InstanceNorm2d)
+NORM_LAYERS.register_module('IN1d', module=nn.InstanceNorm1d)
+NORM_LAYERS.register_module('IN2d', module=nn.InstanceNorm2d)
+NORM_LAYERS.register_module('IN3d', module=nn.InstanceNorm3d)
+
+
+def infer_abbr(class_type):
+ """Infer abbreviation from the class name.
+
+ When we build a norm layer with `build_norm_layer()`, we want to preserve
+ the norm type in variable names, e.g, self.bn1, self.gn. This method will
+ infer the abbreviation to map class types to abbreviations.
+
+ Rule 1: If the class has the property "_abbr_", return the property.
+ Rule 2: If the parent class is _BatchNorm, GroupNorm, LayerNorm or
+ InstanceNorm, the abbreviation of this layer will be "bn", "gn", "ln" and
+ "in" respectively.
+ Rule 3: If the class name contains "batch", "group", "layer" or "instance",
+ the abbreviation of this layer will be "bn", "gn", "ln" and "in"
+ respectively.
+    Rule 4: Otherwise, the abbreviation falls back to "norm_layer".
+
+ Args:
+ class_type (type): The norm layer type.
+
+ Returns:
+ str: The inferred abbreviation.
+ """
+ if not inspect.isclass(class_type):
+ raise TypeError(
+ f'class_type must be a type, but got {type(class_type)}')
+ if hasattr(class_type, '_abbr_'):
+ return class_type._abbr_
+ if issubclass(class_type, _InstanceNorm): # IN is a subclass of BN
+ return 'in'
+ elif issubclass(class_type, _BatchNorm):
+ return 'bn'
+ elif issubclass(class_type, nn.GroupNorm):
+ return 'gn'
+ elif issubclass(class_type, nn.LayerNorm):
+ return 'ln'
+ else:
+ class_name = class_type.__name__.lower()
+ if 'batch' in class_name:
+ return 'bn'
+ elif 'group' in class_name:
+ return 'gn'
+ elif 'layer' in class_name:
+ return 'ln'
+ elif 'instance' in class_name:
+ return 'in'
+ else:
+ return 'norm_layer'
+
+
+def build_norm_layer(cfg: Dict,
+ num_features: int,
+ postfix: Union[int, str] = '') -> Tuple[str, nn.Module]:
+ """Build normalization layer.
+
+ Args:
+ cfg (dict): The norm layer config, which should contain:
+
+ - type (str): Layer type.
+ - layer args: Args needed to instantiate a norm layer.
+            - requires_grad (bool, optional): Whether the layer's
+              parameters require gradient updates. Default: True.
+ num_features (int): Number of input channels.
+ postfix (int | str): The postfix to be appended into norm abbreviation
+ to create named layer.
+
+ Returns:
+ tuple[str, nn.Module]: The first element is the layer name consisting
+ of abbreviation and postfix, e.g., bn1, gn. The second element is the
+ created norm layer.
+ """
+ if not isinstance(cfg, dict):
+ raise TypeError('cfg must be a dict')
+ if 'type' not in cfg:
+ raise KeyError('the cfg dict must contain the key "type"')
+ cfg_ = cfg.copy()
+
+ layer_type = cfg_.pop('type')
+ if layer_type not in NORM_LAYERS:
+ raise KeyError(f'Unrecognized norm type {layer_type}')
+
+ norm_layer = NORM_LAYERS.get(layer_type)
+ abbr = infer_abbr(norm_layer)
+
+ assert isinstance(postfix, (int, str))
+ name = abbr + str(postfix)
+
+ requires_grad = cfg_.pop('requires_grad', True)
+ cfg_.setdefault('eps', 1e-5)
+ if layer_type != 'GN':
+ layer = norm_layer(num_features, **cfg_)
+ if layer_type == 'SyncBN' and hasattr(layer, '_specify_ddp_gpu_num'):
+ layer._specify_ddp_gpu_num(1)
+ else:
+ assert 'num_groups' in cfg_
+ layer = norm_layer(num_channels=num_features, **cfg_)
+
+ for param in layer.parameters():
+ param.requires_grad = requires_grad
+
+ return name, layer
+
+
+def is_norm(layer: nn.Module,
+ exclude: Union[type, tuple, None] = None) -> bool:
+ """Check if a layer is a normalization layer.
+
+ Args:
+ layer (nn.Module): The layer to be checked.
+ exclude (type | tuple[type]): Types to be excluded.
+
+ Returns:
+ bool: Whether the layer is a norm layer.
+ """
+ if exclude is not None:
+ if not isinstance(exclude, tuple):
+ exclude = (exclude, )
+ if not is_tuple_of(exclude, type):
+ raise TypeError(
+ f'"exclude" must be either None or type or a tuple of types, '
+ f'but got {type(exclude)}: {exclude}')
+
+ if exclude and isinstance(layer, exclude):
+ return False
+
+ all_norm_bases = (_BatchNorm, _InstanceNorm, nn.GroupNorm, nn.LayerNorm)
+ return isinstance(layer, all_norm_bases)
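
A short sketch of how `build_norm_layer` is typically called (assuming the vendored package is importable):

```python
import torch.nn as nn
from mmcv.cnn.bricks.norm import build_norm_layer

# 'BN' maps to nn.BatchNorm2d; the returned name is the inferred
# abbreviation plus the postfix.
name, bn = build_norm_layer(dict(type='BN', requires_grad=True), 64, postfix=1)
assert name == 'bn1' and isinstance(bn, nn.BatchNorm2d)

# 'GN' requires num_groups and receives num_features as num_channels.
name, gn = build_norm_layer(dict(type='GN', num_groups=8), 64)
assert name == 'gn' and isinstance(gn, nn.GroupNorm)
```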
diff --git a/mmcv/mmcv/cnn/bricks/padding.py b/mmcv/mmcv/cnn/bricks/padding.py
new file mode 100644
index 0000000000000000000000000000000000000000..8412b0c6576fd220eca52382943ad5889f0dfd1f
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/padding.py
@@ -0,0 +1,38 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict
+
+import torch.nn as nn
+
+from .registry import PADDING_LAYERS
+
+PADDING_LAYERS.register_module('zero', module=nn.ZeroPad2d)
+PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d)
+PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d)
+
+
+def build_padding_layer(cfg: Dict, *args, **kwargs) -> nn.Module:
+ """Build padding layer.
+
+ Args:
+ cfg (dict): The padding layer config, which should contain:
+ - type (str): Layer type.
+ - layer args: Args needed to instantiate a padding layer.
+
+ Returns:
+ nn.Module: Created padding layer.
+ """
+ if not isinstance(cfg, dict):
+ raise TypeError('cfg must be a dict')
+ if 'type' not in cfg:
+ raise KeyError('the cfg dict must contain the key "type"')
+
+ cfg_ = cfg.copy()
+ padding_type = cfg_.pop('type')
+ if padding_type not in PADDING_LAYERS:
+ raise KeyError(f'Unrecognized padding type {padding_type}.')
+ else:
+ padding_layer = PADDING_LAYERS.get(padding_type)
+
+ layer = padding_layer(*args, **kwargs, **cfg_)
+
+ return layer
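
A minimal sketch of the padding builder; positional arguments are forwarded to the padding layer's constructor:

```python
import torch
from mmcv.cnn.bricks.padding import build_padding_layer

# 'reflect' maps to nn.ReflectionPad2d; extra args (here the pad width)
# are passed through to its constructor.
pad = build_padding_layer(dict(type='reflect'), 2)

x = torch.rand(1, 3, 8, 8)
assert pad(x).shape == (1, 3, 12, 12)  # 2 pixels reflected on every side
```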
diff --git a/mmcv/mmcv/cnn/bricks/plugin.py b/mmcv/mmcv/cnn/bricks/plugin.py
new file mode 100644
index 0000000000000000000000000000000000000000..095ef9234501d0bca54373d4422244b80f818341
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/plugin.py
@@ -0,0 +1,94 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import inspect
+import platform
+from typing import Dict, Tuple, Union
+
+import torch.nn as nn
+
+from .registry import PLUGIN_LAYERS
+
+if platform.system() == 'Windows':
+ import regex as re # type: ignore
+else:
+ import re # type: ignore
+
+
+def infer_abbr(class_type: type) -> str:
+ """Infer abbreviation from the class name.
+
+ This method will infer the abbreviation to map class types to
+ abbreviations.
+
+    Rule 1: If the class has the property "_abbr_", return the property.
+ Rule 2: Otherwise, the abbreviation falls back to snake case of class
+ name, e.g. the abbreviation of ``FancyBlock`` will be ``fancy_block``.
+
+ Args:
+ class_type (type): The norm layer type.
+
+ Returns:
+ str: The inferred abbreviation.
+ """
+
+ def camel2snack(word):
+ """Convert camel case word into snack case.
+
+ Modified from `inflection lib
+ `_.
+
+ Example::
+
+ >>> camel2snack("FancyBlock")
+ 'fancy_block'
+ """
+
+ word = re.sub(r'([A-Z]+)([A-Z][a-z])', r'\1_\2', word)
+ word = re.sub(r'([a-z\d])([A-Z])', r'\1_\2', word)
+ word = word.replace('-', '_')
+ return word.lower()
+
+ if not inspect.isclass(class_type):
+ raise TypeError(
+ f'class_type must be a type, but got {type(class_type)}')
+ if hasattr(class_type, '_abbr_'):
+ return class_type._abbr_ # type: ignore
+ else:
+ return camel2snack(class_type.__name__)
+
+
+def build_plugin_layer(cfg: Dict,
+ postfix: Union[int, str] = '',
+ **kwargs) -> Tuple[str, nn.Module]:
+ """Build plugin layer.
+
+ Args:
+ cfg (dict): cfg should contain:
+
+ - type (str): identify plugin layer type.
+ - layer args: args needed to instantiate a plugin layer.
+ postfix (int, str): appended into norm abbreviation to
+ create named layer. Default: ''.
+
+ Returns:
+ tuple[str, nn.Module]: The first one is the concatenation of
+ abbreviation and postfix. The second is the created plugin layer.
+ """
+ if not isinstance(cfg, dict):
+ raise TypeError('cfg must be a dict')
+ if 'type' not in cfg:
+ raise KeyError('the cfg dict must contain the key "type"')
+ cfg_ = cfg.copy()
+
+ layer_type = cfg_.pop('type')
+ if layer_type not in PLUGIN_LAYERS:
+ raise KeyError(f'Unrecognized plugin type {layer_type}')
+
+ plugin_layer = PLUGIN_LAYERS.get(layer_type)
+ abbr = infer_abbr(plugin_layer)
+
+ assert isinstance(postfix, (int, str))
+ name = abbr + str(postfix)
+
+ layer = plugin_layer(**kwargs, **cfg_)
+
+ return name, layer
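
For example, `NonLocal2d` (registered into `PLUGIN_LAYERS` earlier in this diff with `_abbr_ = 'nonlocal_block'`) can be built like this — a sketch, assuming the full vendored package is importable:

```python
from mmcv.cnn.bricks.plugin import build_plugin_layer

name, layer = build_plugin_layer(
    dict(type='NonLocal2d', in_channels=16, reduction=2), postfix='_1')
# The returned name is the inferred abbreviation plus the postfix.
assert name == 'nonlocal_block_1'
```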
diff --git a/mmcv/mmcv/cnn/bricks/registry.py b/mmcv/mmcv/cnn/bricks/registry.py
new file mode 100644
index 0000000000000000000000000000000000000000..c29279776dd523e706b6af8f9b9de700bed05ba7
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/registry.py
@@ -0,0 +1,16 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.utils import Registry
+
+CONV_LAYERS = Registry('conv layer')
+NORM_LAYERS = Registry('norm layer')
+ACTIVATION_LAYERS = Registry('activation layer')
+PADDING_LAYERS = Registry('padding layer')
+UPSAMPLE_LAYERS = Registry('upsample layer')
+PLUGIN_LAYERS = Registry('plugin layer')
+
+DROPOUT_LAYERS = Registry('drop out layers')
+POSITIONAL_ENCODING = Registry('position encoding')
+ATTENTION = Registry('attention')
+FEEDFORWARD_NETWORK = Registry('feed-forward Network')
+TRANSFORMER_LAYER = Registry('transformerLayer')
+TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence')
diff --git a/mmcv/mmcv/cnn/bricks/scale.py b/mmcv/mmcv/cnn/bricks/scale.py
new file mode 100644
index 0000000000000000000000000000000000000000..dbd07c6a445e116bd6f32c96d8b52079ccf9b28a
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/scale.py
@@ -0,0 +1,21 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+
+
+class Scale(nn.Module):
+ """A learnable scale parameter.
+
+ This layer scales the input by a learnable factor. It multiplies a
+ learnable scale parameter of shape (1,) with input of any shape.
+
+ Args:
+ scale (float): Initial value of scale factor. Default: 1.0
+ """
+
+ def __init__(self, scale: float = 1.0):
+ super().__init__()
+ self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return x * self.scale
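
A tiny usage sketch; the learnable factor starts at the given initial value:

```python
import torch
from mmcv.cnn.bricks.scale import Scale

# One Scale per branch is a common pattern, e.g. per-level rescaling of
# regression outputs in dense detection heads.
scale = Scale(scale=1.0)
x = torch.rand(4, 256, 7, 7)
assert torch.allclose(scale(x), x)  # the initial factor is 1.0
```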
diff --git a/mmcv/mmcv/cnn/bricks/swish.py b/mmcv/mmcv/cnn/bricks/swish.py
new file mode 100644
index 0000000000000000000000000000000000000000..b297adff068661859265a5057c1b2204ac8eefa7
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/swish.py
@@ -0,0 +1,25 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+
+from .registry import ACTIVATION_LAYERS
+
+
+@ACTIVATION_LAYERS.register_module()
+class Swish(nn.Module):
+ """Swish Module.
+
+ This module applies the swish function:
+
+ .. math::
+ Swish(x) = x * Sigmoid(x)
+
+ Returns:
+ Tensor: The output tensor.
+ """
+
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ return x * torch.sigmoid(x)
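
Since `Swish` is registered as an activation layer, it can also be built from a config dict; a quick sketch:

```python
import torch
from mmcv.utils import build_from_cfg
from mmcv.cnn.bricks.registry import ACTIVATION_LAYERS

act = build_from_cfg(dict(type='Swish'), ACTIVATION_LAYERS)

x = torch.randn(8)
assert torch.allclose(act(x), x * torch.sigmoid(x))  # Swish(x) = x * sigmoid(x)
```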
diff --git a/mmcv/mmcv/cnn/bricks/transformer.py b/mmcv/mmcv/cnn/bricks/transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7ba4d9f836609cec8526607db98c4b03ec4fee3
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/transformer.py
@@ -0,0 +1,944 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import math
+import warnings
+from typing import Sequence
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmcv.cnn import (Linear, build_activation_layer, build_conv_layer,
+ build_norm_layer)
+from mmcv.runner.base_module import BaseModule, ModuleList, Sequential
+from mmcv.utils import (ConfigDict, build_from_cfg, deprecated_api_warning,
+ to_2tuple)
+from .drop import build_dropout
+from .registry import (ATTENTION, FEEDFORWARD_NETWORK, POSITIONAL_ENCODING,
+ TRANSFORMER_LAYER, TRANSFORMER_LAYER_SEQUENCE)
+
+# Avoid BC-breaking of importing MultiScaleDeformableAttention from this file
+try:
+ from mmcv.ops.multi_scale_deform_attn import \
+ MultiScaleDeformableAttention # noqa F401
+ warnings.warn(
+ ImportWarning(
+ '``MultiScaleDeformableAttention`` has been moved to '
+ '``mmcv.ops.multi_scale_deform_attn``, please change original path ' # noqa E501
+ '``from mmcv.cnn.bricks.transformer import MultiScaleDeformableAttention`` ' # noqa E501
+ 'to ``from mmcv.ops.multi_scale_deform_attn import MultiScaleDeformableAttention`` ' # noqa E501
+ ))
+
+except ImportError:
+ warnings.warn('Fail to import ``MultiScaleDeformableAttention`` from '
+ '``mmcv.ops.multi_scale_deform_attn``, '
+ 'You should install ``mmcv-full`` if you need this module. ')
+
+
+def build_positional_encoding(cfg, default_args=None):
+ """Builder for Position Encoding."""
+ return build_from_cfg(cfg, POSITIONAL_ENCODING, default_args)
+
+
+def build_attention(cfg, default_args=None):
+ """Builder for attention."""
+ return build_from_cfg(cfg, ATTENTION, default_args)
+
+
+def build_feedforward_network(cfg, default_args=None):
+ """Builder for feed-forward network (FFN)."""
+ return build_from_cfg(cfg, FEEDFORWARD_NETWORK, default_args)
+
+
+def build_transformer_layer(cfg, default_args=None):
+ """Builder for transformer layer."""
+ return build_from_cfg(cfg, TRANSFORMER_LAYER, default_args)
+
+
+def build_transformer_layer_sequence(cfg, default_args=None):
+ """Builder for transformer encoder and transformer decoder."""
+ return build_from_cfg(cfg, TRANSFORMER_LAYER_SEQUENCE, default_args)
+
+
+class AdaptivePadding(nn.Module):
+ """Applies padding adaptively to the input.
+
+    This module pads the input so that it is fully covered by the filter
+    you specify. It supports two modes, "same" and "corner": the "same"
+    mode matches the "SAME" padding mode in TensorFlow and pads zeros
+    evenly around the input, while the "corner" mode pads zeros to the
+    bottom right.
+
+ Args:
+ kernel_size (int | tuple): Size of the kernel. Default: 1.
+ stride (int | tuple): Stride of the filter. Default: 1.
+ dilation (int | tuple): Spacing between kernel elements.
+ Default: 1.
+ padding (str): Support "same" and "corner", "corner" mode
+ would pad zero to bottom right, and "same" mode would
+ pad zero around input. Default: "corner".
+
+ Example:
+ >>> kernel_size = 16
+ >>> stride = 16
+ >>> dilation = 1
+ >>> input = torch.rand(1, 1, 15, 17)
+ >>> adap_pad = AdaptivePadding(
+ >>> kernel_size=kernel_size,
+ >>> stride=stride,
+ >>> dilation=dilation,
+ >>> padding="corner")
+ >>> out = adap_pad(input)
+ >>> assert (out.shape[2], out.shape[3]) == (16, 32)
+ >>> input = torch.rand(1, 1, 16, 17)
+ >>> out = adap_pad(input)
+ >>> assert (out.shape[2], out.shape[3]) == (16, 32)
+ """
+
+ def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'):
+ super().__init__()
+ assert padding in ('same', 'corner')
+
+ kernel_size = to_2tuple(kernel_size)
+ stride = to_2tuple(stride)
+ dilation = to_2tuple(dilation)
+
+ self.padding = padding
+ self.kernel_size = kernel_size
+ self.stride = stride
+ self.dilation = dilation
+
+ def get_pad_shape(self, input_shape):
+ """Calculate the padding size of input.
+
+ Args:
+ input_shape (:obj:`torch.Size`): arrange as (H, W).
+
+ Returns:
+ Tuple[int]: The padding size along the
+ original H and W directions
+ """
+ input_h, input_w = input_shape
+ kernel_h, kernel_w = self.kernel_size
+ stride_h, stride_w = self.stride
+ output_h = math.ceil(input_h / stride_h)
+ output_w = math.ceil(input_w / stride_w)
+ pad_h = max((output_h - 1) * stride_h +
+ (kernel_h - 1) * self.dilation[0] + 1 - input_h, 0)
+ pad_w = max((output_w - 1) * stride_w +
+ (kernel_w - 1) * self.dilation[1] + 1 - input_w, 0)
+ return pad_h, pad_w
+
+ def forward(self, x):
+ """Add padding to `x`
+
+ Args:
+ x (Tensor): Input tensor has shape (B, C, H, W).
+
+ Returns:
+ Tensor: The tensor with adaptive padding
+ """
+ pad_h, pad_w = self.get_pad_shape(x.size()[-2:])
+ if pad_h > 0 or pad_w > 0:
+ if self.padding == 'corner':
+ x = F.pad(x, [0, pad_w, 0, pad_h])
+ elif self.padding == 'same':
+ x = F.pad(x, [
+ pad_w // 2, pad_w - pad_w // 2, pad_h // 2,
+ pad_h - pad_h // 2
+ ])
+ return x
+
+
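Following the docstring example, a quick check of the padding arithmetic (a sketch, assuming the vendored package is importable):

```python
import torch
from mmcv.cnn.bricks.transformer import AdaptivePadding

# With a 16x16 kernel and stride 16, a 15x17 input needs (1, 15) extra
# rows/columns so the filter covers it exactly; "corner" pads bottom/right.
adap_pad = AdaptivePadding(kernel_size=16, stride=16, dilation=1,
                           padding='corner')
assert adap_pad.get_pad_shape((15, 17)) == (1, 15)

out = adap_pad(torch.rand(1, 1, 15, 17))
assert out.shape[-2:] == (16, 32)
```
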
+class PatchEmbed(BaseModule):
+ """Image to Patch Embedding.
+
+ We use a conv layer to implement PatchEmbed.
+
+ Args:
+ in_channels (int): The num of input channels. Default: 3
+ embed_dims (int): The dimensions of embedding. Default: 768
+ conv_type (str): The type of convolution
+ to generate patch embedding. Default: "Conv2d".
+ kernel_size (int): The kernel_size of embedding conv. Default: 16.
+ stride (int): The slide stride of embedding conv.
+ Default: 16.
+ padding (int | tuple | string): The padding length of
+ embedding conv. When it is a string, it means the mode
+ of adaptive padding, support "same" and "corner" now.
+ Default: "corner".
+ dilation (int): The dilation rate of embedding conv. Default: 1.
+ bias (bool): Bias of embed conv. Default: True.
+ norm_cfg (dict, optional): Config dict for normalization layer.
+ Default: None.
+ input_size (int | tuple | None): The size of input, which will be
+ used to calculate the out size. Only works when `dynamic_size`
+ is False. Default: None.
+ init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization.
+ Default: None.
+ """
+
+ def __init__(self,
+ in_channels=3,
+ embed_dims=768,
+ conv_type='Conv2d',
+ kernel_size=16,
+ stride=16,
+ padding='corner',
+ dilation=1,
+ bias=True,
+ norm_cfg=None,
+ input_size=None,
+ init_cfg=None):
+ super().__init__(init_cfg=init_cfg)
+
+ self.embed_dims = embed_dims
+ if stride is None:
+ stride = kernel_size
+
+ kernel_size = to_2tuple(kernel_size)
+ stride = to_2tuple(stride)
+ dilation = to_2tuple(dilation)
+
+ if isinstance(padding, str):
+ self.adaptive_padding = AdaptivePadding(
+ kernel_size=kernel_size,
+ stride=stride,
+ dilation=dilation,
+ padding=padding)
+ # disable the padding of conv
+ padding = 0
+ else:
+ self.adaptive_padding = None
+ padding = to_2tuple(padding)
+
+ self.projection = build_conv_layer(
+ dict(type=conv_type),
+ in_channels=in_channels,
+ out_channels=embed_dims,
+ kernel_size=kernel_size,
+ stride=stride,
+ padding=padding,
+ dilation=dilation,
+ bias=bias)
+
+ if norm_cfg is not None:
+ self.norm = build_norm_layer(norm_cfg, embed_dims)[1]
+ else:
+ self.norm = None
+
+ if input_size:
+ input_size = to_2tuple(input_size)
+ # `init_out_size` would be used outside to
+ # calculate the num_patches
+ # e.g. when `use_abs_pos_embed` outside
+ self.init_input_size = input_size
+ if self.adaptive_padding:
+ pad_h, pad_w = self.adaptive_padding.get_pad_shape(input_size)
+ input_h, input_w = input_size
+ input_h = input_h + pad_h
+ input_w = input_w + pad_w
+ input_size = (input_h, input_w)
+
+ # https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html
+ h_out = (input_size[0] + 2 * padding[0] - dilation[0] *
+ (kernel_size[0] - 1) - 1) // stride[0] + 1
+ w_out = (input_size[1] + 2 * padding[1] - dilation[1] *
+ (kernel_size[1] - 1) - 1) // stride[1] + 1
+ self.init_out_size = (h_out, w_out)
+ else:
+ self.init_input_size = None
+ self.init_out_size = None
+
+ def forward(self, x):
+ """
+ Args:
+ x (Tensor): Has shape (B, C, H, W). In most case, C is 3.
+
+ Returns:
+ tuple: Contains merged results and its spatial shape.
+
+ - x (Tensor): Has shape (B, out_h * out_w, embed_dims)
+ - out_size (tuple[int]): Spatial shape of x, arrange as
+ (out_h, out_w).
+ """
+
+ if self.adaptive_padding:
+ x = self.adaptive_padding(x)
+
+ x = self.projection(x)
+ out_size = (x.shape[2], x.shape[3])
+ x = x.flatten(2).transpose(1, 2)
+ if self.norm is not None:
+ x = self.norm(x)
+ return x, out_size
+
+
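A minimal sketch of the patch-embedding call with ViT-style settings (relies on `build_conv_layer` from the wider package):

```python
import torch
from mmcv.cnn.bricks.transformer import PatchEmbed

# 16x16 patches with stride 16 turn a 224x224 image into a 14x14 token grid.
patch_embed = PatchEmbed(in_channels=3, embed_dims=768,
                         kernel_size=16, stride=16)

x = torch.rand(2, 3, 224, 224)
tokens, out_size = patch_embed(x)
assert tokens.shape == (2, 14 * 14, 768) and out_size == (14, 14)
```
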
+class PatchMerging(BaseModule):
+ """Merge patch feature map.
+
+    This layer groups the feature map by kernel_size, and applies norm and
+    linear layers to the grouped feature map (used in Swin Transformer).
+ Our implementation uses `nn.Unfold` to
+ merge patches, which is about 25% faster than the original
+ implementation. However, we need to modify pretrained
+ models for compatibility.
+
+ Args:
+        in_channels (int): The num of input channels.
+ out_channels (int): The num of output channels.
+ kernel_size (int | tuple, optional): the kernel size in the unfold
+ layer. Defaults to 2.
+ stride (int | tuple, optional): the stride of the sliding blocks in the
+ unfold layer. Default: None. (Would be set as `kernel_size`)
+ padding (int | tuple | string ): The padding length of
+ embedding conv. When it is a string, it means the mode
+ of adaptive padding, support "same" and "corner" now.
+ Default: "corner".
+ dilation (int | tuple, optional): dilation parameter in the unfold
+ layer. Default: 1.
+ bias (bool, optional): Whether to add bias in linear layer or not.
+ Defaults: False.
+ norm_cfg (dict, optional): Config dict for normalization layer.
+ Default: dict(type='LN').
+ init_cfg (dict, optional): The extra config for initialization.
+ Default: None.
+ """
+
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size=2,
+ stride=None,
+ padding='corner',
+ dilation=1,
+ bias=False,
+ norm_cfg=dict(type='LN'),
+ init_cfg=None):
+ super().__init__(init_cfg=init_cfg)
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ if stride:
+ stride = stride
+ else:
+ stride = kernel_size
+
+ kernel_size = to_2tuple(kernel_size)
+ stride = to_2tuple(stride)
+ dilation = to_2tuple(dilation)
+
+ if isinstance(padding, str):
+ self.adaptive_padding = AdaptivePadding(
+ kernel_size=kernel_size,
+ stride=stride,
+ dilation=dilation,
+ padding=padding)
+ # disable the padding of unfold
+ padding = 0
+ else:
+ self.adaptive_padding = None
+
+ padding = to_2tuple(padding)
+ self.sampler = nn.Unfold(
+ kernel_size=kernel_size,
+ dilation=dilation,
+ padding=padding,
+ stride=stride)
+
+ sample_dim = kernel_size[0] * kernel_size[1] * in_channels
+
+ if norm_cfg is not None:
+ self.norm = build_norm_layer(norm_cfg, sample_dim)[1]
+ else:
+ self.norm = None
+
+ self.reduction = nn.Linear(sample_dim, out_channels, bias=bias)
+
+ def forward(self, x, input_size):
+ """
+ Args:
+ x (Tensor): Has shape (B, H*W, C_in).
+ input_size (tuple[int]): The spatial shape of x, arrange as (H, W).
+ Default: None.
+
+ Returns:
+ tuple: Contains merged results and its spatial shape.
+
+ - x (Tensor): Has shape (B, Merged_H * Merged_W, C_out)
+ - out_size (tuple[int]): Spatial shape of x, arrange as
+ (Merged_H, Merged_W).
+ """
+ B, L, C = x.shape
+        assert isinstance(input_size, Sequence), \
+            f'Expected input_size to be a Sequence, but got {input_size}'
+
+ H, W = input_size
+ assert L == H * W, 'input feature has wrong size'
+
+ x = x.view(B, H, W, C).permute([0, 3, 1, 2]) # B, C, H, W
+
+ if self.adaptive_padding:
+ x = self.adaptive_padding(x)
+ H, W = x.shape[-2:]
+
+ # Use nn.Unfold to merge patch. About 25% faster than original method,
+ # but need to modify pretrained model for compatibility
+ # if kernel_size=2 and stride=2, x should has shape (B, 4*C, H/2*W/2)
+ x = self.sampler(x)
+
+ out_h = (H + 2 * self.sampler.padding[0] - self.sampler.dilation[0] *
+ (self.sampler.kernel_size[0] - 1) -
+ 1) // self.sampler.stride[0] + 1
+ out_w = (W + 2 * self.sampler.padding[1] - self.sampler.dilation[1] *
+ (self.sampler.kernel_size[1] - 1) -
+ 1) // self.sampler.stride[1] + 1
+
+ output_size = (out_h, out_w)
+ x = x.transpose(1, 2) # B, H/2*W/2, 4*C
+ x = self.norm(x) if self.norm else x
+ x = self.reduction(x)
+ return x, output_size
+
+
+@ATTENTION.register_module()
+class MultiheadAttention(BaseModule):
+ """A wrapper for ``torch.nn.MultiheadAttention``.
+
+ This module implements MultiheadAttention with identity connection,
+ and positional encoding is also passed as input.
+
+ Args:
+ embed_dims (int): The embedding dimension.
+ num_heads (int): Parallel attention heads.
+ attn_drop (float): A Dropout layer on attn_output_weights.
+ Default: 0.0.
+ proj_drop (float): A Dropout layer after `nn.MultiheadAttention`.
+ Default: 0.0.
+ dropout_layer (obj:`ConfigDict`): The dropout_layer used
+ when adding the shortcut.
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+ Default: None.
+ batch_first (bool): When it is True, Key, Query and Value are shape of
+ (batch, n, embed_dim), otherwise (n, batch, embed_dim).
+ Default to False.
+ """
+
+ def __init__(self,
+ embed_dims,
+ num_heads,
+ attn_drop=0.,
+ proj_drop=0.,
+ dropout_layer=dict(type='Dropout', drop_prob=0.),
+ init_cfg=None,
+ batch_first=False,
+ **kwargs):
+ super().__init__(init_cfg)
+ if 'dropout' in kwargs:
+ warnings.warn(
+                'The argument `dropout` in MultiheadAttention '
+                'has been deprecated; now you can separately '
+                'set `attn_drop` (float), `proj_drop` (float), '
+                'and `dropout_layer` (dict). ', DeprecationWarning)
+ attn_drop = kwargs['dropout']
+ dropout_layer['drop_prob'] = kwargs.pop('dropout')
+
+ self.embed_dims = embed_dims
+ self.num_heads = num_heads
+ self.batch_first = batch_first
+
+ self.attn = nn.MultiheadAttention(embed_dims, num_heads, attn_drop,
+ **kwargs)
+
+ self.proj_drop = nn.Dropout(proj_drop)
+ self.dropout_layer = build_dropout(
+ dropout_layer) if dropout_layer else nn.Identity()
+
+ @deprecated_api_warning({'residual': 'identity'},
+ cls_name='MultiheadAttention')
+ def forward(self,
+ query,
+ key=None,
+ value=None,
+ identity=None,
+ query_pos=None,
+ key_pos=None,
+ attn_mask=None,
+ key_padding_mask=None,
+ **kwargs):
+ """Forward function for `MultiheadAttention`.
+
+ **kwargs allow passing a more general data flow when combining
+ with other operations in `transformerlayer`.
+
+ Args:
+ query (Tensor): The input query with shape [num_queries, bs,
+ embed_dims] if self.batch_first is False, else
+                [bs, num_queries, embed_dims].
+ key (Tensor): The key tensor with shape [num_keys, bs,
+ embed_dims] if self.batch_first is False, else
+ [bs, num_keys, embed_dims] .
+ If None, the ``query`` will be used. Defaults to None.
+ value (Tensor): The value tensor with same shape as `key`.
+ Same in `nn.MultiheadAttention.forward`. Defaults to None.
+ If None, the `key` will be used.
+ identity (Tensor): This tensor, with the same shape as x,
+ will be used for the identity link.
+ If None, `x` will be used. Defaults to None.
+ query_pos (Tensor): The positional encoding for query, with
+ the same shape as `x`. If not None, it will
+ be added to `x` before forward function. Defaults to None.
+ key_pos (Tensor): The positional encoding for `key`, with the
+ same shape as `key`. Defaults to None. If not None, it will
+ be added to `key` before forward function. If None, and
+ `query_pos` has the same shape as `key`, then `query_pos`
+ will be used for `key_pos`. Defaults to None.
+ attn_mask (Tensor): ByteTensor mask with shape [num_queries,
+ num_keys]. Same in `nn.MultiheadAttention.forward`.
+ Defaults to None.
+ key_padding_mask (Tensor): ByteTensor with shape [bs, num_keys].
+ Defaults to None.
+
+ Returns:
+ Tensor: forwarded results with shape
+ [num_queries, bs, embed_dims]
+ if self.batch_first is False, else
+                [bs, num_queries, embed_dims].
+ """
+
+ if key is None:
+ key = query
+ if value is None:
+ value = key
+ if identity is None:
+ identity = query
+ if key_pos is None:
+ if query_pos is not None:
+ # use query_pos if key_pos is not available
+ if query_pos.shape == key.shape:
+ key_pos = query_pos
+ else:
+                    warnings.warn(f'position encoding of key is '
+                                  f'missing in {self.__class__.__name__}.')
+ if query_pos is not None:
+ query = query + query_pos
+ if key_pos is not None:
+ key = key + key_pos
+
+ # Because the dataflow('key', 'query', 'value') of
+ # ``torch.nn.MultiheadAttention`` is (num_query, batch,
+ # embed_dims), We should adjust the shape of dataflow from
+ # batch_first (batch, num_query, embed_dims) to num_query_first
+ # (num_query ,batch, embed_dims), and recover ``attn_output``
+ # from num_query_first to batch_first.
+ if self.batch_first:
+ query = query.transpose(0, 1)
+ key = key.transpose(0, 1)
+ value = value.transpose(0, 1)
+
+ out = self.attn(
+ query=query,
+ key=key,
+ value=value,
+ attn_mask=attn_mask,
+ key_padding_mask=key_padding_mask)[0]
+
+ if self.batch_first:
+ out = out.transpose(0, 1)
+
+ return identity + self.dropout_layer(self.proj_drop(out))
+
+
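A short sketch of the wrapper with `batch_first=True`; the residual (identity) connection and the positional-encoding addition are handled inside `forward`:

```python
import torch
from mmcv.cnn.bricks.transformer import MultiheadAttention

self_attn = MultiheadAttention(embed_dims=256, num_heads=8, batch_first=True)

query = torch.rand(2, 100, 256)       # (bs, num_queries, embed_dims)
query_pos = torch.rand(2, 100, 256)
out = self_attn(query, query_pos=query_pos)  # key/value default to query
assert out.shape == (2, 100, 256)
```
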
+@FEEDFORWARD_NETWORK.register_module()
+class FFN(BaseModule):
+ """Implements feed-forward networks (FFNs) with identity connection.
+
+ Args:
+ embed_dims (int): The feature dimension. Same as
+ `MultiheadAttention`. Defaults: 256.
+ feedforward_channels (int): The hidden dimension of FFNs.
+ Defaults: 1024.
+ num_fcs (int, optional): The number of fully-connected layers in
+ FFNs. Default: 2.
+ act_cfg (dict, optional): The activation config for FFNs.
+ Default: dict(type='ReLU')
+ ffn_drop (float, optional): Probability of an element to be
+ zeroed in FFN. Default 0.0.
+ add_identity (bool, optional): Whether to add the
+ identity connection. Default: `True`.
+ dropout_layer (obj:`ConfigDict`): The dropout_layer used
+ when adding the shortcut.
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+ Default: None.
+ """
+
+ @deprecated_api_warning(
+ {
+ 'dropout': 'ffn_drop',
+ 'add_residual': 'add_identity'
+ },
+ cls_name='FFN')
+ def __init__(self,
+ embed_dims=256,
+ feedforward_channels=1024,
+ num_fcs=2,
+ act_cfg=dict(type='ReLU', inplace=True),
+ ffn_drop=0.,
+ dropout_layer=None,
+ add_identity=True,
+ init_cfg=None,
+ **kwargs):
+ super().__init__(init_cfg)
+ assert num_fcs >= 2, 'num_fcs should be no less ' \
+ f'than 2. got {num_fcs}.'
+ self.embed_dims = embed_dims
+ self.feedforward_channels = feedforward_channels
+ self.num_fcs = num_fcs
+ self.act_cfg = act_cfg
+ self.activate = build_activation_layer(act_cfg)
+
+ layers = []
+ in_channels = embed_dims
+ for _ in range(num_fcs - 1):
+ layers.append(
+ Sequential(
+ Linear(in_channels, feedforward_channels), self.activate,
+ nn.Dropout(ffn_drop)))
+ in_channels = feedforward_channels
+ layers.append(Linear(feedforward_channels, embed_dims))
+ layers.append(nn.Dropout(ffn_drop))
+ self.layers = Sequential(*layers)
+ self.dropout_layer = build_dropout(
+ dropout_layer) if dropout_layer else torch.nn.Identity()
+ self.add_identity = add_identity
+
+ @deprecated_api_warning({'residual': 'identity'}, cls_name='FFN')
+ def forward(self, x, identity=None):
+ """Forward function for `FFN`.
+
+        Adds `x` (the identity) to the output tensor when `identity` is None.
+ """
+ out = self.layers(x)
+ if not self.add_identity:
+ return self.dropout_layer(out)
+ if identity is None:
+ identity = x
+ return identity + self.dropout_layer(out)
+
+
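A minimal sketch of the FFN block; with `num_fcs=2` it is the usual two-layer MLP plus identity shortcut:

```python
import torch
from mmcv.cnn.bricks.transformer import FFN

ffn = FFN(embed_dims=256, feedforward_channels=1024, num_fcs=2, ffn_drop=0.1)

x = torch.rand(2, 100, 256)
assert ffn(x).shape == (2, 100, 256)  # identity defaults to x itself
```
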
+@TRANSFORMER_LAYER.register_module()
+class BaseTransformerLayer(BaseModule):
+ """Base `TransformerLayer` for vision transformer.
+
+ It can be built from `mmcv.ConfigDict` and support more flexible
+ customization, for example, using any number of `FFN or LN ` and
+ use different kinds of `attention` by specifying a list of `ConfigDict`
+    named `attn_cfgs`. It is worth mentioning that it supports `prenorm`
+    when you specify `norm` as the first element of `operation_order`.
+    More details about `prenorm` can be found in `On Layer Normalization
+    in the Transformer Architecture <https://arxiv.org/abs/2002.04745>`_.
+
+ Args:
+ attn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
+ Configs for `self_attention` or `cross_attention` modules,
+ The order of the configs in the list should be consistent with
+ corresponding attentions in operation_order.
+ If it is a dict, all of the attention modules in operation_order
+ will be built with this config. Default: None.
+ ffn_cfgs (list[`mmcv.ConfigDict`] | obj:`mmcv.ConfigDict` | None )):
+ Configs for FFN, The order of the configs in the list should be
+ consistent with corresponding ffn in operation_order.
+ If it is a dict, all of the attention modules in operation_order
+ will be built with this config.
+ operation_order (tuple[str]): The execution order of operation
+ in transformer. Such as ('self_attn', 'norm', 'ffn', 'norm').
+            Supports `prenorm` when the first element is `norm`.
+            Default: None.
+ norm_cfg (dict): Config dict for normalization layer.
+ Default: dict(type='LN').
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+ Default: None.
+        batch_first (bool): If True, Key, Query and Value have shape
+            (batch, n, embed_dim); otherwise (n, batch, embed_dim).
+            Defaults to False.
+ """
+
+ def __init__(self,
+ attn_cfgs=None,
+ ffn_cfgs=dict(
+ type='FFN',
+ embed_dims=256,
+ feedforward_channels=1024,
+ num_fcs=2,
+ ffn_drop=0.,
+ act_cfg=dict(type='ReLU', inplace=True),
+ ),
+ operation_order=None,
+ norm_cfg=dict(type='LN'),
+ init_cfg=None,
+ batch_first=False,
+ **kwargs):
+
+ deprecated_args = dict(
+ feedforward_channels='feedforward_channels',
+ ffn_dropout='ffn_drop',
+ ffn_num_fcs='num_fcs')
+ for ori_name, new_name in deprecated_args.items():
+ if ori_name in kwargs:
+ warnings.warn(
+ f'The arguments `{ori_name}` in BaseTransformerLayer '
+ f'has been deprecated, now you should set `{new_name}` '
+ f'and other FFN related arguments '
+ f'to a dict named `ffn_cfgs`. ', DeprecationWarning)
+ ffn_cfgs[new_name] = kwargs[ori_name]
+
+ super().__init__(init_cfg)
+
+ self.batch_first = batch_first
+
+ assert set(operation_order) & {
+ 'self_attn', 'norm', 'ffn', 'cross_attn'} == \
+            set(operation_order), f'The operation_order of' \
+            f' {self.__class__.__name__} should only ' \
+            f'contain operations in ' \
+            f"{['self_attn', 'norm', 'ffn', 'cross_attn']}"
+
+ num_attn = operation_order.count('self_attn') + operation_order.count(
+ 'cross_attn')
+ if isinstance(attn_cfgs, dict):
+ attn_cfgs = [copy.deepcopy(attn_cfgs) for _ in range(num_attn)]
+ else:
+            assert num_attn == len(attn_cfgs), f'The length ' \
+                f'of attn_cfgs {len(attn_cfgs)} is ' \
+                f'not consistent with the number of attentions ' \
+                f'{num_attn} in operation_order {operation_order}.'
+
+ self.num_attn = num_attn
+ self.operation_order = operation_order
+ self.norm_cfg = norm_cfg
+ self.pre_norm = operation_order[0] == 'norm'
+ self.attentions = ModuleList()
+
+ index = 0
+ for operation_name in operation_order:
+ if operation_name in ['self_attn', 'cross_attn']:
+ if 'batch_first' in attn_cfgs[index]:
+ assert self.batch_first == attn_cfgs[index]['batch_first']
+ else:
+ attn_cfgs[index]['batch_first'] = self.batch_first
+ attention = build_attention(attn_cfgs[index])
+ # Some custom attentions used as `self_attn`
+ # or `cross_attn` can have different behavior.
+ attention.operation_name = operation_name
+ self.attentions.append(attention)
+ index += 1
+
+ self.embed_dims = self.attentions[0].embed_dims
+
+ self.ffns = ModuleList()
+ num_ffns = operation_order.count('ffn')
+ if isinstance(ffn_cfgs, dict):
+ ffn_cfgs = ConfigDict(ffn_cfgs)
+ if isinstance(ffn_cfgs, dict):
+ ffn_cfgs = [copy.deepcopy(ffn_cfgs) for _ in range(num_ffns)]
+ assert len(ffn_cfgs) == num_ffns
+ for ffn_index in range(num_ffns):
+ if 'embed_dims' not in ffn_cfgs[ffn_index]:
+ ffn_cfgs[ffn_index]['embed_dims'] = self.embed_dims
+ else:
+ assert ffn_cfgs[ffn_index]['embed_dims'] == self.embed_dims
+ self.ffns.append(
+ build_feedforward_network(ffn_cfgs[ffn_index],
+ dict(type='FFN')))
+
+ self.norms = ModuleList()
+ num_norms = operation_order.count('norm')
+ for _ in range(num_norms):
+ self.norms.append(build_norm_layer(norm_cfg, self.embed_dims)[1])
+
+ def forward(self,
+ query,
+ key=None,
+ value=None,
+ query_pos=None,
+ key_pos=None,
+ attn_masks=None,
+ query_key_padding_mask=None,
+ key_padding_mask=None,
+ **kwargs):
+ """Forward function for `TransformerDecoderLayer`.
+
+ **kwargs contains some specific arguments of attentions.
+
+ Args:
+ query (Tensor): The input query with shape
+ [num_queries, bs, embed_dims] if
+ self.batch_first is False, else
+                [bs, num_queries, embed_dims].
+ key (Tensor): The key tensor with shape [num_keys, bs,
+ embed_dims] if self.batch_first is False, else
+ [bs, num_keys, embed_dims] .
+ value (Tensor): The value tensor with same shape as `key`.
+ query_pos (Tensor): The positional encoding for `query`.
+ Default: None.
+ key_pos (Tensor): The positional encoding for `key`.
+ Default: None.
+ attn_masks (List[Tensor] | None): 2D Tensor used in
+ calculation of corresponding attention. The length of
+ it should equal to the number of `attention` in
+ `operation_order`. Default: None.
+ query_key_padding_mask (Tensor): ByteTensor for `query`, with
+ shape [bs, num_queries]. Only used in `self_attn` layer.
+ Defaults to None.
+            key_padding_mask (Tensor): ByteTensor for `key`, with
+ shape [bs, num_keys]. Default: None.
+
+ Returns:
+ Tensor: forwarded results with shape [num_queries, bs, embed_dims].
+ """
+
+ norm_index = 0
+ attn_index = 0
+ ffn_index = 0
+ identity = query
+ if attn_masks is None:
+ attn_masks = [None for _ in range(self.num_attn)]
+ elif isinstance(attn_masks, torch.Tensor):
+ attn_masks = [
+ copy.deepcopy(attn_masks) for _ in range(self.num_attn)
+ ]
+ warnings.warn(f'Use same attn_mask in all attentions in '
+ f'{self.__class__.__name__} ')
+ else:
+ assert len(attn_masks) == self.num_attn, f'The length of ' \
+ f'attn_masks {len(attn_masks)} must be equal ' \
+ f'to the number of attention in ' \
+ f'operation_order {self.num_attn}'
+
+ for layer in self.operation_order:
+ if layer == 'self_attn':
+ temp_key = temp_value = query
+ query = self.attentions[attn_index](
+ query,
+ temp_key,
+ temp_value,
+ identity if self.pre_norm else None,
+ query_pos=query_pos,
+ key_pos=query_pos,
+ attn_mask=attn_masks[attn_index],
+ key_padding_mask=query_key_padding_mask,
+ **kwargs)
+ attn_index += 1
+ identity = query
+
+ elif layer == 'norm':
+ query = self.norms[norm_index](query)
+ norm_index += 1
+
+ elif layer == 'cross_attn':
+ query = self.attentions[attn_index](
+ query,
+ key,
+ value,
+ identity if self.pre_norm else None,
+ query_pos=query_pos,
+ key_pos=key_pos,
+ attn_mask=attn_masks[attn_index],
+ key_padding_mask=key_padding_mask,
+ **kwargs)
+ attn_index += 1
+ identity = query
+
+ elif layer == 'ffn':
+ query = self.ffns[ffn_index](
+ query, identity if self.pre_norm else None)
+ ffn_index += 1
+
+ return query
+
+
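A sketch of a post-norm, encoder-style layer built from config (the attention and FFN settings shown are illustrative values, not defaults mandated by the class):

```python
import torch
from mmcv.cnn.bricks.transformer import BaseTransformerLayer

layer = BaseTransformerLayer(
    attn_cfgs=dict(type='MultiheadAttention', embed_dims=256, num_heads=8),
    ffn_cfgs=dict(type='FFN', embed_dims=256, feedforward_channels=1024),
    operation_order=('self_attn', 'norm', 'ffn', 'norm'))

query = torch.rand(100, 2, 256)       # (num_queries, bs, embed_dims)
assert layer(query).shape == (100, 2, 256)
```
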
+@TRANSFORMER_LAYER_SEQUENCE.register_module()
+class TransformerLayerSequence(BaseModule):
+ """Base class for TransformerEncoder and TransformerDecoder in vision
+ transformer.
+
+ As base-class of Encoder and Decoder in vision transformer.
+ Support customization such as specifying different kind
+ of `transformer_layer` in `transformer_coder`.
+
+ Args:
+        transformerlayers (list[obj:`mmcv.ConfigDict`] |
+            obj:`mmcv.ConfigDict`): Config of the transformer layers
+            in TransformerCoder. If it is an obj:`mmcv.ConfigDict`,
+            it would be repeated `num_layers` times to a
+ list[`mmcv.ConfigDict`]. Default: None.
+ num_layers (int): The number of `TransformerLayer`. Default: None.
+ init_cfg (obj:`mmcv.ConfigDict`): The Config for initialization.
+ Default: None.
+ """
+
+ def __init__(self, transformerlayers=None, num_layers=None, init_cfg=None):
+ super().__init__(init_cfg)
+ if isinstance(transformerlayers, dict):
+ transformerlayers = [
+ copy.deepcopy(transformerlayers) for _ in range(num_layers)
+ ]
+ else:
+ assert isinstance(transformerlayers, list) and \
+ len(transformerlayers) == num_layers
+ self.num_layers = num_layers
+ self.layers = ModuleList()
+ for i in range(num_layers):
+ self.layers.append(build_transformer_layer(transformerlayers[i]))
+ self.embed_dims = self.layers[0].embed_dims
+ self.pre_norm = self.layers[0].pre_norm
+
+ def forward(self,
+ query,
+ key,
+ value,
+ query_pos=None,
+ key_pos=None,
+ attn_masks=None,
+ query_key_padding_mask=None,
+ key_padding_mask=None,
+ **kwargs):
+ """Forward function for `TransformerCoder`.
+
+ Args:
+ query (Tensor): Input query with shape
+ `(num_queries, bs, embed_dims)`.
+ key (Tensor): The key tensor with shape
+ `(num_keys, bs, embed_dims)`.
+ value (Tensor): The value tensor with shape
+ `(num_keys, bs, embed_dims)`.
+ query_pos (Tensor): The positional encoding for `query`.
+ Default: None.
+ key_pos (Tensor): The positional encoding for `key`.
+ Default: None.
+ attn_masks (List[Tensor], optional): Each element is 2D Tensor
+ which is used in calculation of corresponding attention in
+ operation_order. Default: None.
+ query_key_padding_mask (Tensor): ByteTensor for `query`, with
+ shape [bs, num_queries]. Only used in self-attention
+ Default: None.
+            key_padding_mask (Tensor): ByteTensor for `key`, with
+ shape [bs, num_keys]. Default: None.
+
+ Returns:
+ Tensor: results with shape [num_queries, bs, embed_dims].
+ """
+ for layer in self.layers:
+ query = layer(
+ query,
+ key,
+ value,
+ query_pos=query_pos,
+ key_pos=key_pos,
+ attn_masks=attn_masks,
+ query_key_padding_mask=query_key_padding_mask,
+ key_padding_mask=key_padding_mask,
+ **kwargs)
+ return query
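
Finally, a sketch of stacking the layer above into a small encoder via the sequence builder; the single layer config is deep-copied `num_layers` times:

```python
import torch
from mmcv.cnn.bricks.transformer import build_transformer_layer_sequence

encoder = build_transformer_layer_sequence(
    dict(
        type='TransformerLayerSequence',
        num_layers=2,
        transformerlayers=dict(
            type='BaseTransformerLayer',
            attn_cfgs=dict(
                type='MultiheadAttention', embed_dims=256, num_heads=8),
            ffn_cfgs=dict(
                type='FFN', embed_dims=256, feedforward_channels=1024),
            operation_order=('self_attn', 'norm', 'ffn', 'norm'))))

x = torch.rand(100, 2, 256)           # (num_queries, bs, embed_dims)
out = encoder(query=x, key=None, value=None)   # self-attention only
assert out.shape == (100, 2, 256)
```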
diff --git a/mmcv/mmcv/cnn/bricks/upsample.py b/mmcv/mmcv/cnn/bricks/upsample.py
new file mode 100644
index 0000000000000000000000000000000000000000..d86c5f54a22ed26b09f66bd59659ff7ab1f5b3d9
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/upsample.py
@@ -0,0 +1,87 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Dict
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..utils import xavier_init
+from .registry import UPSAMPLE_LAYERS
+
+UPSAMPLE_LAYERS.register_module('nearest', module=nn.Upsample)
+UPSAMPLE_LAYERS.register_module('bilinear', module=nn.Upsample)
+
+
+@UPSAMPLE_LAYERS.register_module(name='pixel_shuffle')
+class PixelShufflePack(nn.Module):
+ """Pixel Shuffle upsample layer.
+
+ This module packs `F.pixel_shuffle()` and a nn.Conv2d module together to
+ achieve a simple upsampling with pixel shuffle.
+
+ Args:
+ in_channels (int): Number of input channels.
+ out_channels (int): Number of output channels.
+ scale_factor (int): Upsample ratio.
+ upsample_kernel (int): Kernel size of the conv layer to expand the
+ channels.
+ """
+
+ def __init__(self, in_channels: int, out_channels: int, scale_factor: int,
+ upsample_kernel: int):
+ super().__init__()
+ self.in_channels = in_channels
+ self.out_channels = out_channels
+ self.scale_factor = scale_factor
+ self.upsample_kernel = upsample_kernel
+ self.upsample_conv = nn.Conv2d(
+ self.in_channels,
+ self.out_channels * scale_factor * scale_factor,
+ self.upsample_kernel,
+ padding=(self.upsample_kernel - 1) // 2)
+ self.init_weights()
+
+ def init_weights(self):
+ xavier_init(self.upsample_conv, distribution='uniform')
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ x = self.upsample_conv(x)
+ x = F.pixel_shuffle(x, self.scale_factor)
+ return x
+
+
+def build_upsample_layer(cfg: Dict, *args, **kwargs) -> nn.Module:
+ """Build upsample layer.
+
+ Args:
+ cfg (dict): The upsample layer config, which should contain:
+
+ - type (str): Layer type.
+ - scale_factor (int): Upsample ratio, which is not applicable to
+ deconv.
+            - layer args: Args needed to instantiate an upsample layer.
+ args (argument list): Arguments passed to the ``__init__``
+ method of the corresponding conv layer.
+ kwargs (keyword arguments): Keyword arguments passed to the
+ ``__init__`` method of the corresponding conv layer.
+
+ Returns:
+ nn.Module: Created upsample layer.
+ """
+ if not isinstance(cfg, dict):
+ raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
+ if 'type' not in cfg:
+ raise KeyError(
+ f'the cfg dict must contain the key "type", but got {cfg}')
+ cfg_ = cfg.copy()
+
+ layer_type = cfg_.pop('type')
+ if layer_type not in UPSAMPLE_LAYERS:
+ raise KeyError(f'Unrecognized upsample type {layer_type}')
+ else:
+ upsample = UPSAMPLE_LAYERS.get(layer_type)
+
+ if upsample is nn.Upsample:
+ cfg_['mode'] = layer_type
+ layer = upsample(*args, **kwargs, **cfg_)
+ return layer
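
A sketch of the two build paths: for 'nearest'/'bilinear' the type string doubles as the `nn.Upsample` mode, while 'pixel_shuffle' builds the conv + pixel-shuffle pack above:

```python
import torch
from mmcv.cnn.bricks.upsample import build_upsample_layer

up = build_upsample_layer(
    dict(type='bilinear', scale_factor=2, align_corners=False))
assert up(torch.rand(1, 8, 16, 16)).shape == (1, 8, 32, 32)

ps = build_upsample_layer(
    dict(type='pixel_shuffle', in_channels=8, out_channels=4,
         scale_factor=2, upsample_kernel=3))
assert ps(torch.rand(1, 8, 16, 16)).shape == (1, 4, 32, 32)
```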
diff --git a/mmcv/mmcv/cnn/bricks/wrappers.py b/mmcv/mmcv/cnn/bricks/wrappers.py
new file mode 100644
index 0000000000000000000000000000000000000000..a07eff00e49970c7692ee3f2625c7f7aba9d7b22
--- /dev/null
+++ b/mmcv/mmcv/cnn/bricks/wrappers.py
@@ -0,0 +1,180 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+r"""Modified from https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/wrappers.py # noqa: E501
+
+Wrap some nn modules to support empty tensor input. Currently, these wrappers
+are mainly used in mask heads like fcn_mask_head and maskiou_heads since mask
+heads are trained on only positive RoIs.
+"""
+import math
+
+import torch
+import torch.nn as nn
+from torch.nn.modules.utils import _pair, _triple
+
+from .registry import CONV_LAYERS, UPSAMPLE_LAYERS
+
+if torch.__version__ == 'parrots':
+ TORCH_VERSION = torch.__version__
+else:
+ # torch.__version__ could be 1.3.1+cu92, we only need the first two
+ # for comparison
+ TORCH_VERSION = tuple(int(x) for x in torch.__version__.split('.')[:2])
+
+
+def obsolete_torch_version(torch_version, version_threshold) -> bool:
+ return torch_version == 'parrots' or torch_version <= version_threshold
+
+
+class NewEmptyTensorOp(torch.autograd.Function):
+
+ @staticmethod
+ def forward(ctx, x: torch.Tensor, new_shape: tuple) -> torch.Tensor:
+ ctx.shape = x.shape
+ return x.new_empty(new_shape)
+
+ @staticmethod
+ def backward(ctx, grad: torch.Tensor) -> tuple:
+ shape = ctx.shape
+ return NewEmptyTensorOp.apply(grad, shape), None
+
+
+@CONV_LAYERS.register_module('Conv', force=True)
+class Conv2d(nn.Conv2d):
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
+ out_shape = [x.shape[0], self.out_channels]
+ for i, k, p, s, d in zip(x.shape[-2:], self.kernel_size,
+ self.padding, self.stride, self.dilation):
+ o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1
+ out_shape.append(o)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ if self.training:
+ # produce dummy gradient to avoid DDP warning.
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+ return empty + dummy
+ else:
+ return empty
+
+ return super().forward(x)
+
+
+@CONV_LAYERS.register_module('Conv3d', force=True)
+class Conv3d(nn.Conv3d):
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
+ out_shape = [x.shape[0], self.out_channels]
+ for i, k, p, s, d in zip(x.shape[-3:], self.kernel_size,
+ self.padding, self.stride, self.dilation):
+ o = (i + 2 * p - (d * (k - 1) + 1)) // s + 1
+ out_shape.append(o)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ if self.training:
+ # produce dummy gradient to avoid DDP warning.
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+ return empty + dummy
+ else:
+ return empty
+
+ return super().forward(x)
+
+
+@CONV_LAYERS.register_module()
+@CONV_LAYERS.register_module('deconv')
+@UPSAMPLE_LAYERS.register_module('deconv', force=True)
+class ConvTranspose2d(nn.ConvTranspose2d):
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
+ out_shape = [x.shape[0], self.out_channels]
+ for i, k, p, s, d, op in zip(x.shape[-2:], self.kernel_size,
+ self.padding, self.stride,
+ self.dilation, self.output_padding):
+ out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ if self.training:
+ # produce dummy gradient to avoid DDP warning.
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+ return empty + dummy
+ else:
+ return empty
+
+ return super().forward(x)
+
+
+@CONV_LAYERS.register_module()
+@CONV_LAYERS.register_module('deconv3d')
+@UPSAMPLE_LAYERS.register_module('deconv3d', force=True)
+class ConvTranspose3d(nn.ConvTranspose3d):
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 4)):
+ out_shape = [x.shape[0], self.out_channels]
+ for i, k, p, s, d, op in zip(x.shape[-3:], self.kernel_size,
+ self.padding, self.stride,
+ self.dilation, self.output_padding):
+ out_shape.append((i - 1) * s - 2 * p + (d * (k - 1) + 1) + op)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ if self.training:
+ # produce dummy gradient to avoid DDP warning.
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+ return empty + dummy
+ else:
+ return empty
+
+ return super().forward(x)
+
+
+class MaxPool2d(nn.MaxPool2d):
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ # PyTorch 1.9 does not support empty tensor inference yet
+ if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)):
+ out_shape = list(x.shape[:2])
+ for i, k, p, s, d in zip(x.shape[-2:], _pair(self.kernel_size),
+ _pair(self.padding), _pair(self.stride),
+ _pair(self.dilation)):
+ o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1
+ o = math.ceil(o) if self.ceil_mode else math.floor(o)
+ out_shape.append(o)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ return empty
+
+ return super().forward(x)
+
+
+class MaxPool3d(nn.MaxPool3d):
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ # PyTorch 1.9 does not support empty tensor inference yet
+ if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 9)):
+ out_shape = list(x.shape[:2])
+ for i, k, p, s, d in zip(x.shape[-3:], _triple(self.kernel_size),
+ _triple(self.padding),
+ _triple(self.stride),
+ _triple(self.dilation)):
+ o = (i + 2 * p - (d * (k - 1) + 1)) / s + 1
+ o = math.ceil(o) if self.ceil_mode else math.floor(o)
+ out_shape.append(o)
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ return empty
+
+ return super().forward(x)
+
+
+class Linear(torch.nn.Linear):
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # empty tensor forward of Linear layer is supported in PyTorch 1.6
+ if x.numel() == 0 and obsolete_torch_version(TORCH_VERSION, (1, 5)):
+ out_shape = [x.shape[0], self.out_features]
+ empty = NewEmptyTensorOp.apply(x, out_shape)
+ if self.training:
+ # produce dummy gradient to avoid DDP warning.
+ dummy = sum(x.view(-1)[0] for x in self.parameters()) * 0.0
+ return empty + dummy
+ else:
+ return empty
+
+ return super().forward(x)
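
A sketch of the empty-tensor behaviour these wrappers exist for (e.g. a batch with zero positive RoIs); on recent PyTorch the native empty-batch support is used, on old versions the `NewEmptyTensorOp` path above kicks in:

```python
import torch
from mmcv.cnn.bricks.wrappers import Conv2d, Linear

conv = Conv2d(256, 128, kernel_size=3, padding=1)
feats = torch.rand(0, 256, 7, 7)      # zero RoIs, e.g. no positives
assert conv(feats).shape == (0, 128, 7, 7)

fc = Linear(256, 81)
assert fc(torch.rand(0, 256)).shape == (0, 81)
```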
diff --git a/mmcv/mmcv/cnn/builder.py b/mmcv/mmcv/cnn/builder.py
new file mode 100644
index 0000000000000000000000000000000000000000..7567316c566bd3aca6d8f65a84b00e9e890948a7
--- /dev/null
+++ b/mmcv/mmcv/cnn/builder.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from ..runner import Sequential
+from ..utils import Registry, build_from_cfg
+
+
+def build_model_from_cfg(cfg, registry, default_args=None):
+ """Build a PyTorch model from config dict(s). Different from
+ ``build_from_cfg``, if cfg is a list, a ``nn.Sequential`` will be built.
+
+ Args:
+        cfg (dict, list[dict]): The config of modules; it is either a config
+            dict or a list of config dicts. If cfg is a list, the built
+            modules will be wrapped with ``nn.Sequential``.
+ registry (:obj:`Registry`): A registry the module belongs to.
+ default_args (dict, optional): Default arguments to build the module.
+ Defaults to None.
+
+ Returns:
+ nn.Module: A built nn module.
+ """
+ if isinstance(cfg, list):
+ modules = [
+ build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg
+ ]
+ return Sequential(*modules)
+ else:
+ return build_from_cfg(cfg, registry, default_args)
+
+
+MODELS = Registry('model', build_func=build_model_from_cfg)
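+
+# Illustrative sketch: a list config builds an ``nn.Sequential`` while a
+# single dict builds one module. ``FooNet`` is a hypothetical class assumed
+# to be registered in ``MODELS``.
+#
+#   cfg = [dict(type='FooNet', in_ch=3, out_ch=16),
+#          dict(type='FooNet', in_ch=16, out_ch=32)]
+#   model = build_model_from_cfg(cfg, MODELS)   # -> Sequential of two FooNets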
diff --git a/mmcv/mmcv/cnn/resnet.py b/mmcv/mmcv/cnn/resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..fb29e6256280b671acfbf73fd9a01f079749b260
--- /dev/null
+++ b/mmcv/mmcv/cnn/resnet.py
@@ -0,0 +1,322 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+from typing import Optional, Sequence, Tuple, Union
+
+import torch.nn as nn
+import torch.utils.checkpoint as cp
+from torch import Tensor
+
+from .utils import constant_init, kaiming_init
+
+
+def conv3x3(in_planes: int,
+ out_planes: int,
+ stride: int = 1,
+ dilation: int = 1):
+ """3x3 convolution with padding."""
+ return nn.Conv2d(
+ in_planes,
+ out_planes,
+ kernel_size=3,
+ stride=stride,
+ padding=dilation,
+ dilation=dilation,
+ bias=False)
+
+
+class BasicBlock(nn.Module):
+ expansion = 1
+
+ def __init__(self,
+ inplanes: int,
+ planes: int,
+ stride: int = 1,
+ dilation: int = 1,
+ downsample: Optional[nn.Module] = None,
+ style: str = 'pytorch',
+ with_cp: bool = False):
+ super().__init__()
+ assert style in ['pytorch', 'caffe']
+ self.conv1 = conv3x3(inplanes, planes, stride, dilation)
+ self.bn1 = nn.BatchNorm2d(planes)
+ self.relu = nn.ReLU(inplace=True)
+ self.conv2 = conv3x3(planes, planes)
+ self.bn2 = nn.BatchNorm2d(planes)
+ self.downsample = downsample
+ self.stride = stride
+ self.dilation = dilation
+ assert not with_cp
+
+ def forward(self, x: Tensor) -> Tensor:
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+ out = self.relu(out)
+
+ return out
+
+
+class Bottleneck(nn.Module):
+ expansion = 4
+
+ def __init__(self,
+ inplanes: int,
+ planes: int,
+ stride: int = 1,
+ dilation: int = 1,
+ downsample: Optional[nn.Module] = None,
+ style: str = 'pytorch',
+ with_cp: bool = False):
+ """Bottleneck block.
+
+ If style is "pytorch", the stride-two layer is the 3x3 conv layer, if
+ it is "caffe", the stride-two layer is the first 1x1 conv layer.
+ """
+ super().__init__()
+ assert style in ['pytorch', 'caffe']
+ if style == 'pytorch':
+ conv1_stride = 1
+ conv2_stride = stride
+ else:
+ conv1_stride = stride
+ conv2_stride = 1
+ self.conv1 = nn.Conv2d(
+ inplanes, planes, kernel_size=1, stride=conv1_stride, bias=False)
+ self.conv2 = nn.Conv2d(
+ planes,
+ planes,
+ kernel_size=3,
+ stride=conv2_stride,
+ padding=dilation,
+ dilation=dilation,
+ bias=False)
+
+ self.bn1 = nn.BatchNorm2d(planes)
+ self.bn2 = nn.BatchNorm2d(planes)
+ self.conv3 = nn.Conv2d(
+ planes, planes * self.expansion, kernel_size=1, bias=False)
+ self.bn3 = nn.BatchNorm2d(planes * self.expansion)
+ self.relu = nn.ReLU(inplace=True)
+ self.downsample = downsample
+ self.stride = stride
+ self.dilation = dilation
+ self.with_cp = with_cp
+
+ def forward(self, x: Tensor) -> Tensor:
+
+ def _inner_forward(x):
+ residual = x
+
+ out = self.conv1(x)
+ out = self.bn1(out)
+ out = self.relu(out)
+
+ out = self.conv2(out)
+ out = self.bn2(out)
+ out = self.relu(out)
+
+ out = self.conv3(out)
+ out = self.bn3(out)
+
+ if self.downsample is not None:
+ residual = self.downsample(x)
+
+ out += residual
+
+ return out
+
+ if self.with_cp and x.requires_grad:
+ out = cp.checkpoint(_inner_forward, x)
+ else:
+ out = _inner_forward(x)
+
+ out = self.relu(out)
+
+ return out
+
+
+def make_res_layer(block: nn.Module,
+ inplanes: int,
+ planes: int,
+ blocks: int,
+ stride: int = 1,
+ dilation: int = 1,
+ style: str = 'pytorch',
+ with_cp: bool = False) -> nn.Module:
+ downsample = None
+ if stride != 1 or inplanes != planes * block.expansion:
+ downsample = nn.Sequential(
+ nn.Conv2d(
+ inplanes,
+ planes * block.expansion,
+ kernel_size=1,
+ stride=stride,
+ bias=False),
+ nn.BatchNorm2d(planes * block.expansion),
+ )
+
+ layers = []
+ layers.append(
+ block(
+ inplanes,
+ planes,
+ stride,
+ dilation,
+ downsample,
+ style=style,
+ with_cp=with_cp))
+ inplanes = planes * block.expansion
+ for _ in range(1, blocks):
+ layers.append(
+ block(inplanes, planes, 1, dilation, style=style, with_cp=with_cp))
+
+ return nn.Sequential(*layers)
+
+
+class ResNet(nn.Module):
+ """ResNet backbone.
+
+ Args:
+ depth (int): Depth of resnet, from {18, 34, 50, 101, 152}.
+ num_stages (int): Resnet stages, normally 4.
+ strides (Sequence[int]): Strides of the first block of each stage.
+ dilations (Sequence[int]): Dilation of each stage.
+ out_indices (Sequence[int]): Output from which stages.
+ style (str): `pytorch` or `caffe`. If set to "pytorch", the stride-two
+ layer is the 3x3 conv layer, otherwise the stride-two layer is
+ the first 1x1 conv layer.
+ frozen_stages (int): Stages to be frozen (all param fixed). -1 means
+ not freezing any parameters.
+ bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze
+ running stats (mean and var).
+ bn_frozen (bool): Whether to freeze weight and bias of BN layers.
+ with_cp (bool): Use checkpoint or not. Using checkpoint will save some
+ memory while slowing down the training speed.
+ """
+
+ arch_settings = {
+ 18: (BasicBlock, (2, 2, 2, 2)),
+ 34: (BasicBlock, (3, 4, 6, 3)),
+ 50: (Bottleneck, (3, 4, 6, 3)),
+ 101: (Bottleneck, (3, 4, 23, 3)),
+ 152: (Bottleneck, (3, 8, 36, 3))
+ }
+
+ def __init__(self,
+ depth: int,
+ num_stages: int = 4,
+ strides: Sequence[int] = (1, 2, 2, 2),
+ dilations: Sequence[int] = (1, 1, 1, 1),
+ out_indices: Sequence[int] = (0, 1, 2, 3),
+ style: str = 'pytorch',
+ frozen_stages: int = -1,
+ bn_eval: bool = True,
+ bn_frozen: bool = False,
+ with_cp: bool = False):
+ super().__init__()
+ if depth not in self.arch_settings:
+ raise KeyError(f'invalid depth {depth} for resnet')
+ assert num_stages >= 1 and num_stages <= 4
+ block, stage_blocks = self.arch_settings[depth]
+ stage_blocks = stage_blocks[:num_stages] # type: ignore
+ assert len(strides) == len(dilations) == num_stages
+ assert max(out_indices) < num_stages
+
+ self.out_indices = out_indices
+ self.style = style
+ self.frozen_stages = frozen_stages
+ self.bn_eval = bn_eval
+ self.bn_frozen = bn_frozen
+ self.with_cp = with_cp
+
+ self.inplanes: int = 64
+ self.conv1 = nn.Conv2d(
+ 3, 64, kernel_size=7, stride=2, padding=3, bias=False)
+ self.bn1 = nn.BatchNorm2d(64)
+ self.relu = nn.ReLU(inplace=True)
+ self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+
+ self.res_layers = []
+ for i, num_blocks in enumerate(stage_blocks):
+ stride = strides[i]
+ dilation = dilations[i]
+ planes = 64 * 2**i
+ res_layer = make_res_layer(
+ block,
+ self.inplanes,
+ planes,
+ num_blocks,
+ stride=stride,
+ dilation=dilation,
+ style=self.style,
+ with_cp=with_cp)
+ self.inplanes = planes * block.expansion # type: ignore
+ layer_name = f'layer{i + 1}'
+ self.add_module(layer_name, res_layer)
+ self.res_layers.append(layer_name)
+
+ self.feat_dim = block.expansion * 64 * 2**( # type: ignore
+ len(stage_blocks) - 1)
+
+ def init_weights(self, pretrained: Optional[str] = None) -> None:
+ if isinstance(pretrained, str):
+ logger = logging.getLogger()
+ from ..runner import load_checkpoint
+ load_checkpoint(self, pretrained, strict=False, logger=logger)
+ elif pretrained is None:
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ kaiming_init(m)
+ elif isinstance(m, nn.BatchNorm2d):
+ constant_init(m, 1)
+ else:
+ raise TypeError('pretrained must be a str or None')
+
+ def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]:
+ x = self.conv1(x)
+ x = self.bn1(x)
+ x = self.relu(x)
+ x = self.maxpool(x)
+ outs = []
+ for i, layer_name in enumerate(self.res_layers):
+ res_layer = getattr(self, layer_name)
+ x = res_layer(x)
+ if i in self.out_indices:
+ outs.append(x)
+ if len(outs) == 1:
+ return outs[0]
+ else:
+ return tuple(outs)
+
+ def train(self, mode: bool = True) -> None:
+ super().train(mode)
+ if self.bn_eval:
+ for m in self.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eval()
+ if self.bn_frozen:
+ for params in m.parameters():
+ params.requires_grad = False
+ if mode and self.frozen_stages >= 0:
+ for param in self.conv1.parameters():
+ param.requires_grad = False
+ for param in self.bn1.parameters():
+ param.requires_grad = False
+ self.bn1.eval()
+ self.bn1.weight.requires_grad = False
+ self.bn1.bias.requires_grad = False
+ for i in range(1, self.frozen_stages + 1):
+ mod = getattr(self, f'layer{i}')
+ mod.eval()
+ for param in mod.parameters():
+ param.requires_grad = False
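+
+# Illustrative sketch: typical standalone use of this backbone, returning the
+# four stage feature maps for a 224x224 input (channel counts assume the
+# Bottleneck block, i.e. depth >= 50).
+#
+#   model = ResNet(depth=50, out_indices=(0, 1, 2, 3))
+#   model.init_weights()                       # Kaiming conv / constant BN
+#   feats = model(torch.rand(1, 3, 224, 224))  # tuple of 4 tensors with
+#                                              # 256/512/1024/2048 channels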
diff --git a/mmcv/mmcv/cnn/utils/__init__.py b/mmcv/mmcv/cnn/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a263e31c1e3977712827ca229bbc04910b4e928e
--- /dev/null
+++ b/mmcv/mmcv/cnn/utils/__init__.py
@@ -0,0 +1,19 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .flops_counter import get_model_complexity_info
+from .fuse_conv_bn import fuse_conv_bn
+from .sync_bn import revert_sync_batchnorm
+from .weight_init import (INITIALIZERS, Caffe2XavierInit, ConstantInit,
+ KaimingInit, NormalInit, PretrainedInit,
+ TruncNormalInit, UniformInit, XavierInit,
+ bias_init_with_prob, caffe2_xavier_init,
+ constant_init, initialize, kaiming_init, normal_init,
+ trunc_normal_init, uniform_init, xavier_init)
+
+__all__ = [
+ 'get_model_complexity_info', 'bias_init_with_prob', 'caffe2_xavier_init',
+ 'constant_init', 'kaiming_init', 'normal_init', 'trunc_normal_init',
+ 'uniform_init', 'xavier_init', 'fuse_conv_bn', 'initialize',
+ 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit',
+ 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit',
+ 'Caffe2XavierInit', 'revert_sync_batchnorm'
+]
diff --git a/mmcv/mmcv/cnn/utils/flops_counter.py b/mmcv/mmcv/cnn/utils/flops_counter.py
new file mode 100644
index 0000000000000000000000000000000000000000..150a55992a9561073626d26df503ba4ef37efa18
--- /dev/null
+++ b/mmcv/mmcv/cnn/utils/flops_counter.py
@@ -0,0 +1,603 @@
+# Modified from flops-counter.pytorch by Vladislav Sovrasov
+# original repo: https://github.com/sovrasov/flops-counter.pytorch
+
+# MIT License
+
+# Copyright (c) 2018 Vladislav Sovrasov
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+import sys
+import warnings
+from functools import partial
+from typing import Any, Callable, Dict, Optional, TextIO, Tuple
+
+import numpy as np
+import torch
+import torch.nn as nn
+
+import mmcv
+
+
+def get_model_complexity_info(model: nn.Module,
+ input_shape: tuple,
+ print_per_layer_stat: bool = True,
+ as_strings: bool = True,
+ input_constructor: Optional[Callable] = None,
+ flush: bool = False,
+ ost: TextIO = sys.stdout) -> tuple:
+ """Get complexity information of a model.
+
+ This method can calculate FLOPs and parameter counts of a model with
+ corresponding input shape. It can also print complexity information for
+ each layer in a model.
+
+ Supported layers are listed as below:
+ - Convolutions: ``nn.Conv1d``, ``nn.Conv2d``, ``nn.Conv3d``.
+ - Activations: ``nn.ReLU``, ``nn.PReLU``, ``nn.ELU``,
+ ``nn.LeakyReLU``, ``nn.ReLU6``.
+ - Poolings: ``nn.MaxPool1d``, ``nn.MaxPool2d``, ``nn.MaxPool3d``,
+ ``nn.AvgPool1d``, ``nn.AvgPool2d``, ``nn.AvgPool3d``,
+ ``nn.AdaptiveMaxPool1d``, ``nn.AdaptiveMaxPool2d``,
+ ``nn.AdaptiveMaxPool3d``, ``nn.AdaptiveAvgPool1d``,
+ ``nn.AdaptiveAvgPool2d``, ``nn.AdaptiveAvgPool3d``.
+ - BatchNorms: ``nn.BatchNorm1d``, ``nn.BatchNorm2d``,
+ ``nn.BatchNorm3d``, ``nn.GroupNorm``, ``nn.InstanceNorm1d``,
+ ``InstanceNorm2d``, ``InstanceNorm3d``, ``nn.LayerNorm``.
+ - Linear: ``nn.Linear``.
+ - Deconvolution: ``nn.ConvTranspose2d``.
+ - Upsample: ``nn.Upsample``.
+
+ Args:
+ model (nn.Module): The model for complexity calculation.
+ input_shape (tuple): Input shape used for calculation.
+ print_per_layer_stat (bool): Whether to print complexity information
+ for each layer in a model. Default: True.
+ as_strings (bool): Output FLOPs and params counts in a string form.
+ Default: True.
+        input_constructor (None | callable): If specified, it is a callable
+            that generates the input; otherwise, a random tensor with the
+            given input shape is used to calculate FLOPs. Default: None.
+ flush (bool): same as that in :func:`print`. Default: False.
+ ost (stream): same as ``file`` param in :func:`print`.
+ Default: sys.stdout.
+
+ Returns:
+ tuple[float | str]: If ``as_strings`` is set to True, it will return
+            FLOPs and parameter counts in a string format; otherwise, it will
+            return them as floats.
+ """
+ assert type(input_shape) is tuple
+ assert len(input_shape) >= 1
+ assert isinstance(model, nn.Module)
+ flops_model = add_flops_counting_methods(model)
+ flops_model.eval()
+ flops_model.start_flops_count()
+ if input_constructor:
+ input = input_constructor(input_shape)
+ _ = flops_model(**input)
+ else:
+ try:
+ batch = torch.ones(()).new_empty(
+ (1, *input_shape),
+ dtype=next(flops_model.parameters()).dtype,
+ device=next(flops_model.parameters()).device)
+ except StopIteration:
+ # Avoid StopIteration for models which have no parameters,
+ # like `nn.Relu()`, `nn.AvgPool2d`, etc.
+ batch = torch.ones(()).new_empty((1, *input_shape))
+
+ _ = flops_model(batch)
+
+ flops_count, params_count = flops_model.compute_average_flops_cost()
+ if print_per_layer_stat:
+ print_model_with_flops(
+ flops_model, flops_count, params_count, ost=ost, flush=flush)
+ flops_model.stop_flops_count()
+
+ if as_strings:
+ return flops_to_string(flops_count), params_to_string(params_count)
+
+ return flops_count, params_count
+
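+# Illustrative sketch: typical usage; the input shape excludes the batch
+# dimension and ``model`` is any hypothetical ``nn.Module``.
+#
+#   flops, params = get_model_complexity_info(model, (3, 224, 224),
+#                                             print_per_layer_stat=False)
+#   # for a torchvision ResNet-50 this yields roughly '4.12 GFLOPs' and
+#   # '25.56 M' (one multiply-add counted as one FLOP)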
+
+def flops_to_string(flops: float,
+ units: Optional[str] = 'GFLOPs',
+ precision: int = 2) -> str:
+ """Convert FLOPs number into a string.
+
+    Note that here one multiply-add is counted as one FLOP.
+
+ Args:
+ flops (float): FLOPs number to be converted.
+ units (str | None): Converted FLOPs units. Options are None, 'GFLOPs',
+ 'MFLOPs', 'KFLOPs', 'FLOPs'. If set to None, it will automatically
+ choose the most suitable unit for FLOPs. Default: 'GFLOPs'.
+ precision (int): Digit number after the decimal point. Default: 2.
+
+ Returns:
+ str: The converted FLOPs number with units.
+
+ Examples:
+ >>> flops_to_string(1e9)
+ '1.0 GFLOPs'
+ >>> flops_to_string(2e5, 'MFLOPs')
+ '0.2 MFLOPs'
+ >>> flops_to_string(3e-9, None)
+ '3e-09 FLOPs'
+ """
+ if units is None:
+ if flops // 10**9 > 0:
+ return str(round(flops / 10.**9, precision)) + ' GFLOPs'
+ elif flops // 10**6 > 0:
+ return str(round(flops / 10.**6, precision)) + ' MFLOPs'
+ elif flops // 10**3 > 0:
+ return str(round(flops / 10.**3, precision)) + ' KFLOPs'
+ else:
+ return str(flops) + ' FLOPs'
+ else:
+ if units == 'GFLOPs':
+ return str(round(flops / 10.**9, precision)) + ' ' + units
+ elif units == 'MFLOPs':
+ return str(round(flops / 10.**6, precision)) + ' ' + units
+ elif units == 'KFLOPs':
+ return str(round(flops / 10.**3, precision)) + ' ' + units
+ else:
+ return str(flops) + ' FLOPs'
+
+
+def params_to_string(num_params: float,
+ units: Optional[str] = None,
+ precision: int = 2) -> str:
+ """Convert parameter number into a string.
+
+ Args:
+ num_params (float): Parameter number to be converted.
+ units (str | None): Converted FLOPs units. Options are None, 'M',
+ 'K' and ''. If set to None, it will automatically choose the most
+ suitable unit for Parameter number. Default: None.
+ precision (int): Digit number after the decimal point. Default: 2.
+
+ Returns:
+ str: The converted parameter number with units.
+
+ Examples:
+ >>> params_to_string(1e9)
+ '1000.0 M'
+ >>> params_to_string(2e5)
+ '200.0 k'
+ >>> params_to_string(3e-9)
+ '3e-09'
+ """
+ if units is None:
+ if num_params // 10**6 > 0:
+ return str(round(num_params / 10**6, precision)) + ' M'
+ elif num_params // 10**3:
+ return str(round(num_params / 10**3, precision)) + ' k'
+ else:
+ return str(num_params)
+ else:
+ if units == 'M':
+ return str(round(num_params / 10.**6, precision)) + ' ' + units
+ elif units == 'K':
+ return str(round(num_params / 10.**3, precision)) + ' ' + units
+ else:
+ return str(num_params)
+
+
+def print_model_with_flops(model: nn.Module,
+ total_flops: float,
+ total_params: float,
+ units: Optional[str] = 'GFLOPs',
+ precision: int = 3,
+ ost: TextIO = sys.stdout,
+ flush: bool = False) -> None:
+ """Print a model with FLOPs for each layer.
+
+ Args:
+ model (nn.Module): The model to be printed.
+ total_flops (float): Total FLOPs of the model.
+ total_params (float): Total parameter counts of the model.
+ units (str | None): Converted FLOPs units. Default: 'GFLOPs'.
+ precision (int): Digit number after the decimal point. Default: 3.
+ ost (stream): same as `file` param in :func:`print`.
+ Default: sys.stdout.
+ flush (bool): same as that in :func:`print`. Default: False.
+
+ Example:
+ >>> class ExampleModel(nn.Module):
+
+ >>> def __init__(self):
+ >>> super().__init__()
+ >>> self.conv1 = nn.Conv2d(3, 8, 3)
+ >>> self.conv2 = nn.Conv2d(8, 256, 3)
+ >>> self.conv3 = nn.Conv2d(256, 8, 3)
+ >>> self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))
+ >>> self.flatten = nn.Flatten()
+ >>> self.fc = nn.Linear(8, 1)
+
+ >>> def forward(self, x):
+ >>> x = self.conv1(x)
+ >>> x = self.conv2(x)
+ >>> x = self.conv3(x)
+ >>> x = self.avg_pool(x)
+ >>> x = self.flatten(x)
+ >>> x = self.fc(x)
+ >>> return x
+
+ >>> model = ExampleModel()
+ >>> x = (3, 16, 16)
+ to print the complexity information state for each layer, you can use
+ >>> get_model_complexity_info(model, x)
+ or directly use
+ >>> print_model_with_flops(model, 4579784.0, 37361)
+ ExampleModel(
+ 0.037 M, 100.000% Params, 0.005 GFLOPs, 100.000% FLOPs,
+ (conv1): Conv2d(0.0 M, 0.600% Params, 0.0 GFLOPs, 0.959% FLOPs, 3, 8, kernel_size=(3, 3), stride=(1, 1)) # noqa: E501
+ (conv2): Conv2d(0.019 M, 50.020% Params, 0.003 GFLOPs, 58.760% FLOPs, 8, 256, kernel_size=(3, 3), stride=(1, 1))
+ (conv3): Conv2d(0.018 M, 49.356% Params, 0.002 GFLOPs, 40.264% FLOPs, 256, 8, kernel_size=(3, 3), stride=(1, 1))
+ (avg_pool): AdaptiveAvgPool2d(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.017% FLOPs, output_size=(1, 1))
+ (flatten): Flatten(0.0 M, 0.000% Params, 0.0 GFLOPs, 0.000% FLOPs, )
+ (fc): Linear(0.0 M, 0.024% Params, 0.0 GFLOPs, 0.000% FLOPs, in_features=8, out_features=1, bias=True)
+ )
+ """
+
+ def accumulate_params(self):
+ if is_supported_instance(self):
+ return self.__params__
+ else:
+ sum = 0
+ for m in self.children():
+ sum += m.accumulate_params()
+ return sum
+
+ def accumulate_flops(self):
+ if is_supported_instance(self):
+ return self.__flops__ / model.__batch_counter__
+ else:
+ sum = 0
+ for m in self.children():
+ sum += m.accumulate_flops()
+ return sum
+
+ def flops_repr(self):
+ accumulated_num_params = self.accumulate_params()
+ accumulated_flops_cost = self.accumulate_flops()
+ return ', '.join([
+ params_to_string(
+ accumulated_num_params, units='M', precision=precision),
+ f'{accumulated_num_params / total_params:.3%} Params',
+ flops_to_string(
+ accumulated_flops_cost, units=units, precision=precision),
+ f'{accumulated_flops_cost / total_flops:.3%} FLOPs',
+ self.original_extra_repr()
+ ])
+
+ def add_extra_repr(m):
+ m.accumulate_flops = accumulate_flops.__get__(m)
+ m.accumulate_params = accumulate_params.__get__(m)
+ flops_extra_repr = flops_repr.__get__(m)
+ if m.extra_repr != flops_extra_repr:
+ m.original_extra_repr = m.extra_repr
+ m.extra_repr = flops_extra_repr
+ assert m.extra_repr != m.original_extra_repr
+
+ def del_extra_repr(m):
+ if hasattr(m, 'original_extra_repr'):
+ m.extra_repr = m.original_extra_repr
+ del m.original_extra_repr
+ if hasattr(m, 'accumulate_flops'):
+ del m.accumulate_flops
+
+ model.apply(add_extra_repr)
+ print(model, file=ost, flush=flush)
+ model.apply(del_extra_repr)
+
+
+def get_model_parameters_number(model: nn.Module) -> float:
+ """Calculate parameter number of a model.
+
+ Args:
+ model (nn.module): The model for parameter number calculation.
+
+ Returns:
+ float: Parameter number of the model.
+ """
+ num_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+ return num_params
+
+
+def add_flops_counting_methods(net_main_module: nn.Module) -> nn.Module:
+    # Add additional methods to the existing module object. This is done so
+    # that each bound function has access to the module (``self``) object.
+ net_main_module.start_flops_count = start_flops_count.__get__( # type: ignore # noqa E501
+ net_main_module)
+ net_main_module.stop_flops_count = stop_flops_count.__get__( # type: ignore # noqa E501
+ net_main_module)
+ net_main_module.reset_flops_count = reset_flops_count.__get__( # type: ignore # noqa E501
+ net_main_module)
+ net_main_module.compute_average_flops_cost = compute_average_flops_cost.__get__( # type: ignore # noqa E501
+ net_main_module)
+
+ net_main_module.reset_flops_count()
+
+ return net_main_module
+
+
+def compute_average_flops_cost(self) -> Tuple[float, float]:
+ """Compute average FLOPs cost.
+
+ A method to compute average FLOPs cost, which will be available after
+ `add_flops_counting_methods()` is called on a desired net object.
+
+ Returns:
+        tuple[float, float]: Current mean flops consumption per image and
+            the parameter count of the model.
+ """
+ batches_count = self.__batch_counter__
+ flops_sum = 0
+ for module in self.modules():
+ if is_supported_instance(module):
+ flops_sum += module.__flops__
+ params_sum = get_model_parameters_number(self)
+ return flops_sum / batches_count, params_sum
+
+
+def start_flops_count(self) -> None:
+ """Activate the computation of mean flops consumption per image.
+
+    A method to activate the computation of mean flops consumption per image,
+    which will be available after ``add_flops_counting_methods()`` is called on
+ a desired net object. It should be called before running the network.
+ """
+ add_batch_counter_hook_function(self)
+
+ def add_flops_counter_hook_function(module: nn.Module) -> None:
+ if is_supported_instance(module):
+ if hasattr(module, '__flops_handle__'):
+ return
+
+ else:
+ handle = module.register_forward_hook(
+ get_modules_mapping()[type(module)])
+
+ module.__flops_handle__ = handle
+
+ self.apply(partial(add_flops_counter_hook_function))
+
+
+def stop_flops_count(self) -> None:
+ """Stop computing the mean flops consumption per image.
+
+ A method to stop computing the mean flops consumption per image, which will
+ be available after ``add_flops_counting_methods()`` is called on a desired
+    net object. It can be called to pause the computation at any time.
+ """
+ remove_batch_counter_hook_function(self)
+ self.apply(remove_flops_counter_hook_function)
+
+
+def reset_flops_count(self) -> None:
+ """Reset statistics computed so far.
+
+    A method to reset the computed statistics, which will be available after
+ `add_flops_counting_methods()` is called on a desired net object.
+ """
+ add_batch_counter_variables_or_reset(self)
+ self.apply(add_flops_counter_variable_or_reset)
+
+
+# ---- Internal functions
+def empty_flops_counter_hook(module: nn.Module, input: tuple,
+ output: Any) -> None:
+ module.__flops__ += 0
+
+
+def upsample_flops_counter_hook(module: nn.Module, input: tuple,
+ output: torch.Tensor) -> None:
+ output_size = output[0]
+ batch_size = output_size.shape[0]
+ output_elements_count = batch_size
+ for val in output_size.shape[1:]:
+ output_elements_count *= val
+ module.__flops__ += int(output_elements_count)
+
+
+def relu_flops_counter_hook(module: nn.Module, input: tuple,
+ output: torch.Tensor) -> None:
+ active_elements_count = output.numel()
+ module.__flops__ += int(active_elements_count)
+
+
+def linear_flops_counter_hook(module: nn.Module, input: tuple,
+ output: torch.Tensor) -> None:
+ output_last_dim = output.shape[
+ -1] # pytorch checks dimensions, so here we don't care much
+ module.__flops__ += int(np.prod(input[0].shape) * output_last_dim)
+
+
+def pool_flops_counter_hook(module: nn.Module, input: tuple,
+ output: torch.Tensor) -> None:
+ module.__flops__ += int(np.prod(input[0].shape))
+
+
+def norm_flops_counter_hook(module: nn.Module, input: tuple,
+ output: torch.Tensor) -> None:
+ batch_flops = np.prod(input[0].shape)
+ if (getattr(module, 'affine', False)
+ or getattr(module, 'elementwise_affine', False)):
+ batch_flops *= 2
+ module.__flops__ += int(batch_flops)
+
+
+def deconv_flops_counter_hook(conv_module: nn.Module, input: tuple,
+ output: torch.Tensor) -> None:
+ # Can have multiple inputs, getting the first one
+ batch_size = input[0].shape[0]
+ input_height, input_width = input[0].shape[2:]
+
+ kernel_height, kernel_width = conv_module.kernel_size
+ in_channels = conv_module.in_channels
+ out_channels = conv_module.out_channels
+ groups = conv_module.groups
+
+ filters_per_channel = out_channels // groups
+ conv_per_position_flops = (
+ kernel_height * kernel_width * in_channels * filters_per_channel)
+
+ active_elements_count = batch_size * input_height * input_width
+ overall_conv_flops = conv_per_position_flops * active_elements_count
+ bias_flops = 0
+ if conv_module.bias is not None:
+ output_height, output_width = output.shape[2:]
+ bias_flops = out_channels * batch_size * output_height * output_width
+ overall_flops = overall_conv_flops + bias_flops
+
+ conv_module.__flops__ += int(overall_flops)
+
+
+def conv_flops_counter_hook(conv_module: nn.Module, input: tuple,
+ output: torch.Tensor) -> None:
+ # Can have multiple inputs, getting the first one
+ batch_size = input[0].shape[0]
+ output_dims = list(output.shape[2:])
+
+ kernel_dims = list(conv_module.kernel_size)
+ in_channels = conv_module.in_channels
+ out_channels = conv_module.out_channels
+ groups = conv_module.groups
+
+ filters_per_channel = out_channels // groups
+ conv_per_position_flops = int(
+ np.prod(kernel_dims)) * in_channels * filters_per_channel
+
+ active_elements_count = batch_size * int(np.prod(output_dims))
+
+ overall_conv_flops = conv_per_position_flops * active_elements_count
+
+ bias_flops = 0
+
+ if conv_module.bias is not None:
+
+ bias_flops = out_channels * active_elements_count
+
+ overall_flops = overall_conv_flops + bias_flops
+
+ conv_module.__flops__ += int(overall_flops)
+
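+# Worked example of the count above: a bias-free 3x3 conv with 64 input and
+# 128 output channels (groups=1) producing a 56x56 map at batch size 1 gives
+#   conv_per_position_flops = 3 * 3 * 64 * 128 = 73,728
+#   active_elements_count   = 1 * 56 * 56      = 3,136
+#   overall_flops           = 73,728 * 3,136   ≈ 2.31e8 multiply-adds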
+
+def batch_counter_hook(module: nn.Module, input: tuple, output: Any) -> None:
+ batch_size = 1
+ if len(input) > 0:
+ # Can have multiple inputs, getting the first one
+ batch_size = len(input[0])
+ else:
+ warnings.warn('No positional inputs found for a module, '
+ 'assuming batch size is 1.')
+ module.__batch_counter__ += batch_size
+
+
+def add_batch_counter_variables_or_reset(module: nn.Module) -> None:
+
+ module.__batch_counter__ = 0
+
+
+def add_batch_counter_hook_function(module: nn.Module) -> None:
+ if hasattr(module, '__batch_counter_handle__'):
+ return
+
+ handle = module.register_forward_hook(batch_counter_hook)
+ module.__batch_counter_handle__ = handle
+
+
+def remove_batch_counter_hook_function(module: nn.Module) -> None:
+ if hasattr(module, '__batch_counter_handle__'):
+ module.__batch_counter_handle__.remove()
+ del module.__batch_counter_handle__
+
+
+def add_flops_counter_variable_or_reset(module: nn.Module) -> None:
+ if is_supported_instance(module):
+ if hasattr(module, '__flops__') or hasattr(module, '__params__'):
+ warnings.warn('variables __flops__ or __params__ are already '
+                          'defined for the module ' + type(module).__name__ +
+                          '. ptflops can affect your code!')
+ module.__flops__ = 0
+ module.__params__ = get_model_parameters_number(module)
+
+
+def is_supported_instance(module: nn.Module) -> bool:
+ if type(module) in get_modules_mapping():
+ return True
+ return False
+
+
+def remove_flops_counter_hook_function(module: nn.Module) -> None:
+ if is_supported_instance(module):
+ if hasattr(module, '__flops_handle__'):
+ module.__flops_handle__.remove()
+ del module.__flops_handle__
+
+
+def get_modules_mapping() -> Dict:
+ return {
+ # convolutions
+ nn.Conv1d: conv_flops_counter_hook,
+ nn.Conv2d: conv_flops_counter_hook,
+ mmcv.cnn.bricks.Conv2d: conv_flops_counter_hook,
+ nn.Conv3d: conv_flops_counter_hook,
+ mmcv.cnn.bricks.Conv3d: conv_flops_counter_hook,
+ # activations
+ nn.ReLU: relu_flops_counter_hook,
+ nn.PReLU: relu_flops_counter_hook,
+ nn.ELU: relu_flops_counter_hook,
+ nn.LeakyReLU: relu_flops_counter_hook,
+ nn.ReLU6: relu_flops_counter_hook,
+ # poolings
+ nn.MaxPool1d: pool_flops_counter_hook,
+ nn.AvgPool1d: pool_flops_counter_hook,
+ nn.AvgPool2d: pool_flops_counter_hook,
+ nn.MaxPool2d: pool_flops_counter_hook,
+ mmcv.cnn.bricks.MaxPool2d: pool_flops_counter_hook,
+ nn.MaxPool3d: pool_flops_counter_hook,
+ mmcv.cnn.bricks.MaxPool3d: pool_flops_counter_hook,
+ nn.AvgPool3d: pool_flops_counter_hook,
+ nn.AdaptiveMaxPool1d: pool_flops_counter_hook,
+ nn.AdaptiveAvgPool1d: pool_flops_counter_hook,
+ nn.AdaptiveMaxPool2d: pool_flops_counter_hook,
+ nn.AdaptiveAvgPool2d: pool_flops_counter_hook,
+ nn.AdaptiveMaxPool3d: pool_flops_counter_hook,
+ nn.AdaptiveAvgPool3d: pool_flops_counter_hook,
+ # normalizations
+ nn.BatchNorm1d: norm_flops_counter_hook,
+ nn.BatchNorm2d: norm_flops_counter_hook,
+ nn.BatchNorm3d: norm_flops_counter_hook,
+ nn.GroupNorm: norm_flops_counter_hook,
+ nn.InstanceNorm1d: norm_flops_counter_hook,
+ nn.InstanceNorm2d: norm_flops_counter_hook,
+ nn.InstanceNorm3d: norm_flops_counter_hook,
+ nn.LayerNorm: norm_flops_counter_hook,
+ # FC
+ nn.Linear: linear_flops_counter_hook,
+ mmcv.cnn.bricks.Linear: linear_flops_counter_hook,
+ # Upscale
+ nn.Upsample: upsample_flops_counter_hook,
+ # Deconvolution
+ nn.ConvTranspose2d: deconv_flops_counter_hook,
+ mmcv.cnn.bricks.ConvTranspose2d: deconv_flops_counter_hook,
+ }
diff --git a/mmcv/mmcv/cnn/utils/fuse_conv_bn.py b/mmcv/mmcv/cnn/utils/fuse_conv_bn.py
new file mode 100644
index 0000000000000000000000000000000000000000..6ccaab3bf1eb3ce615bad910d6dc45a467bb1fe4
--- /dev/null
+++ b/mmcv/mmcv/cnn/utils/fuse_conv_bn.py
@@ -0,0 +1,59 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+
+
+def _fuse_conv_bn(conv: nn.Module, bn: nn.Module) -> nn.Module:
+ """Fuse conv and bn into one module.
+
+ Args:
+ conv (nn.Module): Conv to be fused.
+ bn (nn.Module): BN to be fused.
+
+ Returns:
+ nn.Module: Fused module.
+ """
+ conv_w = conv.weight
+ conv_b = conv.bias if conv.bias is not None else torch.zeros_like(
+ bn.running_mean)
+
+ factor = bn.weight / torch.sqrt(bn.running_var + bn.eps)
+ conv.weight = nn.Parameter(conv_w *
+ factor.reshape([conv.out_channels, 1, 1, 1]))
+ conv.bias = nn.Parameter((conv_b - bn.running_mean) * factor + bn.bias)
+ return conv
+
+
+def fuse_conv_bn(module: nn.Module) -> nn.Module:
+ """Recursively fuse conv and bn in a module.
+
+    During inference, the functionality of batch norm layers is turned off
+    and only the per-channel running mean and variance are used, which makes
+    it possible to fuse them with the preceding conv layers to save
+    computation and simplify the network structure.
+
+ Args:
+ module (nn.Module): Module to be fused.
+
+ Returns:
+ nn.Module: Fused module.
+ """
+ last_conv = None
+ last_conv_name = None
+
+ for name, child in module.named_children():
+ if isinstance(child,
+ (nn.modules.batchnorm._BatchNorm, nn.SyncBatchNorm)):
+ if last_conv is None: # only fuse BN that is after Conv
+ continue
+ fused_conv = _fuse_conv_bn(last_conv, child)
+ module._modules[last_conv_name] = fused_conv
+ # To reduce changes, set BN as Identity instead of deleting it.
+ module._modules[name] = nn.Identity()
+ last_conv = None
+ elif isinstance(child, nn.Conv2d):
+ last_conv = child
+ last_conv_name = name
+ else:
+ fuse_conv_bn(child)
+ return module
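+
+# Illustrative sketch: fusion is intended for inference only, since the fused
+# conv bakes in the current running statistics. ``build_detector`` is a
+# hypothetical model factory whose output contains Conv2d + BN pairs.
+#
+#   model = build_detector(cfg)
+#   model.eval()
+#   model = fuse_conv_bn(model)   # each fused BN is replaced by nn.Identity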
diff --git a/mmcv/mmcv/cnn/utils/sync_bn.py b/mmcv/mmcv/cnn/utils/sync_bn.py
new file mode 100644
index 0000000000000000000000000000000000000000..c534fc0e17506dde31c20529ce7bef64eef87140
--- /dev/null
+++ b/mmcv/mmcv/cnn/utils/sync_bn.py
@@ -0,0 +1,61 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+
+import mmcv
+
+
+class _BatchNormXd(nn.modules.batchnorm._BatchNorm):
+ """A general BatchNorm layer without input dimension check.
+
+ Reproduced from @kapily's work:
+ (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547)
+    The only difference between BatchNorm1d, BatchNorm2d, BatchNorm3d, etc.
+    is `_check_input_dim`, which is designed for tensor sanity checks.
+ The check has been bypassed in this class for the convenience of converting
+ SyncBatchNorm.
+ """
+
+ def _check_input_dim(self, input: torch.Tensor):
+ return
+
+
+def revert_sync_batchnorm(module: nn.Module) -> nn.Module:
+ """Helper function to convert all `SyncBatchNorm` (SyncBN) and
+    `mmcv.ops.sync_bn.SyncBatchNorm` (MMSyncBN) layers in the model to
+ `BatchNormXd` layers.
+
+ Adapted from @kapily's work:
+ (https://github.com/pytorch/pytorch/issues/41081#issuecomment-783961547)
+
+ Args:
+ module (nn.Module): The module containing `SyncBatchNorm` layers.
+
+ Returns:
+ module_output: The converted module with `BatchNormXd` layers.
+ """
+ module_output = module
+ module_checklist = [torch.nn.modules.batchnorm.SyncBatchNorm]
+ if hasattr(mmcv, 'ops'):
+ module_checklist.append(mmcv.ops.SyncBatchNorm)
+ if isinstance(module, tuple(module_checklist)):
+ module_output = _BatchNormXd(module.num_features, module.eps,
+ module.momentum, module.affine,
+ module.track_running_stats)
+ if module.affine:
+ # no_grad() may not be needed here but
+ # just to be consistent with `convert_sync_batchnorm()`
+ with torch.no_grad():
+ module_output.weight = module.weight
+ module_output.bias = module.bias
+ module_output.running_mean = module.running_mean
+ module_output.running_var = module.running_var
+ module_output.num_batches_tracked = module.num_batches_tracked
+ module_output.training = module.training
+ # qconfig exists in quantized models
+ if hasattr(module, 'qconfig'):
+ module_output.qconfig = module.qconfig
+ for name, child in module.named_children():
+ module_output.add_module(name, revert_sync_batchnorm(child))
+ del module
+ return module_output
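+
+# Illustrative sketch: handy when running a SyncBN-trained checkpoint on a
+# single GPU or CPU, where ``SyncBatchNorm`` would otherwise require an
+# initialized process group. ``build_model`` is a hypothetical factory.
+#
+#   model = build_model(cfg)
+#   model = revert_sync_batchnorm(model)   # SyncBN -> _BatchNormXd
+#   out = model(inputs)                    # plain batch-norm semantics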
diff --git a/mmcv/mmcv/cnn/utils/weight_init.py b/mmcv/mmcv/cnn/utils/weight_init.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e0d293ad4fb315462e34d5899ae6fccc4a7ba86
--- /dev/null
+++ b/mmcv/mmcv/cnn/utils/weight_init.py
@@ -0,0 +1,708 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import math
+import warnings
+from typing import Dict, List, Optional, Union
+
+import numpy as np
+import torch
+import torch.nn as nn
+from torch import Tensor
+
+from mmcv.utils import Registry, build_from_cfg, get_logger, print_log
+
+INITIALIZERS = Registry('initializer')
+
+
+def update_init_info(module: nn.Module, init_info: str) -> None:
+ """Update the `_params_init_info` in the module if the value of parameters
+ are changed.
+
+ Args:
+ module (obj:`nn.Module`): The module of PyTorch with a user-defined
+ attribute `_params_init_info` which records the initialization
+ information.
+ init_info (str): The string that describes the initialization.
+ """
+ assert hasattr(
+ module,
+ '_params_init_info'), f'Can not find `_params_init_info` in {module}'
+ for name, param in module.named_parameters():
+
+ assert param in module._params_init_info, (
+ f'Find a new :obj:`Parameter` '
+ f'named `{name}` during executing the '
+ f'`init_weights` of '
+ f'`{module.__class__.__name__}`. '
+ f'Please do not add or '
+ f'replace parameters during executing '
+ f'the `init_weights`. ')
+
+ # The parameter has been changed during executing the
+ # `init_weights` of module
+ mean_value = param.data.mean()
+ if module._params_init_info[param]['tmp_mean_value'] != mean_value:
+ module._params_init_info[param]['init_info'] = init_info
+ module._params_init_info[param]['tmp_mean_value'] = mean_value
+
+
+def constant_init(module: nn.Module, val: float, bias: float = 0) -> None:
+ if hasattr(module, 'weight') and module.weight is not None:
+ nn.init.constant_(module.weight, val)
+ if hasattr(module, 'bias') and module.bias is not None:
+ nn.init.constant_(module.bias, bias)
+
+
+def xavier_init(module: nn.Module,
+ gain: float = 1,
+ bias: float = 0,
+ distribution: str = 'normal') -> None:
+ assert distribution in ['uniform', 'normal']
+ if hasattr(module, 'weight') and module.weight is not None:
+ if distribution == 'uniform':
+ nn.init.xavier_uniform_(module.weight, gain=gain)
+ else:
+ nn.init.xavier_normal_(module.weight, gain=gain)
+ if hasattr(module, 'bias') and module.bias is not None:
+ nn.init.constant_(module.bias, bias)
+
+
+def normal_init(module: nn.Module,
+ mean: float = 0,
+ std: float = 1,
+ bias: float = 0) -> None:
+ if hasattr(module, 'weight') and module.weight is not None:
+ nn.init.normal_(module.weight, mean, std)
+ if hasattr(module, 'bias') and module.bias is not None:
+ nn.init.constant_(module.bias, bias)
+
+
+def trunc_normal_init(module: nn.Module,
+ mean: float = 0,
+ std: float = 1,
+ a: float = -2,
+ b: float = 2,
+ bias: float = 0) -> None:
+ if hasattr(module, 'weight') and module.weight is not None:
+ trunc_normal_(module.weight, mean, std, a, b) # type: ignore
+ if hasattr(module, 'bias') and module.bias is not None:
+ nn.init.constant_(module.bias, bias) # type: ignore
+
+
+def uniform_init(module: nn.Module,
+ a: float = 0,
+ b: float = 1,
+ bias: float = 0) -> None:
+ if hasattr(module, 'weight') and module.weight is not None:
+ nn.init.uniform_(module.weight, a, b)
+ if hasattr(module, 'bias') and module.bias is not None:
+ nn.init.constant_(module.bias, bias)
+
+
+def kaiming_init(module: nn.Module,
+ a: float = 0,
+ mode: str = 'fan_out',
+ nonlinearity: str = 'relu',
+ bias: float = 0,
+ distribution: str = 'normal') -> None:
+ assert distribution in ['uniform', 'normal']
+ if hasattr(module, 'weight') and module.weight is not None:
+ if distribution == 'uniform':
+ nn.init.kaiming_uniform_(
+ module.weight, a=a, mode=mode, nonlinearity=nonlinearity)
+ else:
+ nn.init.kaiming_normal_(
+ module.weight, a=a, mode=mode, nonlinearity=nonlinearity)
+ if hasattr(module, 'bias') and module.bias is not None:
+ nn.init.constant_(module.bias, bias)
+
+
+def caffe2_xavier_init(module: nn.Module, bias: float = 0) -> None:
+ # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch
+ # Acknowledgment to FAIR's internal code
+ kaiming_init(
+ module,
+ a=1,
+ mode='fan_in',
+ nonlinearity='leaky_relu',
+ bias=bias,
+ distribution='uniform')
+
+
+def bias_init_with_prob(prior_prob: float) -> float:
+ """initialize conv/fc bias value according to a given probability value."""
+ bias_init = float(-np.log((1 - prior_prob) / prior_prob))
+ return bias_init
+
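+# Worked example: for a focal-loss style prior of 0.01,
+# bias_init_with_prob(0.01) = -log(0.99 / 0.01) ≈ -4.595, so a sigmoid over
+# the freshly initialized logits outputs roughly 0.01 for every class.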
+
+def _get_bases_name(m: nn.Module) -> List[str]:
+ return [b.__name__ for b in m.__class__.__bases__]
+
+
+class BaseInit:
+
+ def __init__(self,
+ *,
+ bias: float = 0,
+ bias_prob: Optional[float] = None,
+ layer: Union[str, List, None] = None):
+ self.wholemodule = False
+ if not isinstance(bias, (int, float)):
+ raise TypeError(f'bias must be a number, but got a {type(bias)}')
+
+ if bias_prob is not None:
+ if not isinstance(bias_prob, float):
+ raise TypeError(f'bias_prob type must be float, \
+ but got {type(bias_prob)}')
+
+ if layer is not None:
+ if not isinstance(layer, (str, list)):
+ raise TypeError(f'layer must be a str or a list of str, \
+ but got a {type(layer)}')
+ else:
+ layer = []
+
+ if bias_prob is not None:
+ self.bias = bias_init_with_prob(bias_prob)
+ else:
+ self.bias = bias
+ self.layer = [layer] if isinstance(layer, str) else layer
+
+ def _get_init_info(self) -> str:
+ info = f'{self.__class__.__name__}, bias={self.bias}'
+ return info
+
+
+@INITIALIZERS.register_module(name='Constant')
+class ConstantInit(BaseInit):
+ """Initialize module parameters with constant values.
+
+ Args:
+ val (int | float): the value to fill the weights in the module with
+ bias (int | float): the value to fill the bias. Defaults to 0.
+ bias_prob (float, optional): the probability for bias initialization.
+ Defaults to None.
+        layer (str | list[str], optional): the layer(s) to be initialized.
+ Defaults to None.
+ """
+
+ def __init__(self, val: Union[int, float], **kwargs):
+ super().__init__(**kwargs)
+ self.val = val
+
+ def __call__(self, module: nn.Module) -> None:
+
+ def init(m):
+ if self.wholemodule:
+ constant_init(m, self.val, self.bias)
+ else:
+ layername = m.__class__.__name__
+ basesname = _get_bases_name(m)
+ if len(set(self.layer) & set([layername] + basesname)):
+ constant_init(m, self.val, self.bias)
+
+ module.apply(init)
+ if hasattr(module, '_params_init_info'):
+ update_init_info(module, init_info=self._get_init_info())
+
+ def _get_init_info(self) -> str:
+ info = f'{self.__class__.__name__}: val={self.val}, bias={self.bias}'
+ return info
+
+
+@INITIALIZERS.register_module(name='Xavier')
+class XavierInit(BaseInit):
+ r"""Initialize module parameters with values according to the method
+    described in "Understanding the difficulty of training deep feedforward
+    neural networks" - Glorot, X. & Bengio, Y. (2010).
+
+ Args:
+ gain (int | float): an optional scaling factor. Defaults to 1.
+ bias (int | float): the value to fill the bias. Defaults to 0.
+ bias_prob (float, optional): the probability for bias initialization.
+ Defaults to None.
+ distribution (str): distribution either be ``'normal'``
+ or ``'uniform'``. Defaults to ``'normal'``.
+        layer (str | list[str], optional): the layer(s) to be initialized.
+ Defaults to None.
+ """
+
+ def __init__(self,
+ gain: float = 1,
+ distribution: str = 'normal',
+ **kwargs):
+ super().__init__(**kwargs)
+ self.gain = gain
+ self.distribution = distribution
+
+ def __call__(self, module: nn.Module) -> None:
+
+ def init(m):
+ if self.wholemodule:
+ xavier_init(m, self.gain, self.bias, self.distribution)
+ else:
+ layername = m.__class__.__name__
+ basesname = _get_bases_name(m)
+ if len(set(self.layer) & set([layername] + basesname)):
+ xavier_init(m, self.gain, self.bias, self.distribution)
+
+ module.apply(init)
+ if hasattr(module, '_params_init_info'):
+ update_init_info(module, init_info=self._get_init_info())
+
+ def _get_init_info(self) -> str:
+ info = f'{self.__class__.__name__}: gain={self.gain}, ' \
+ f'distribution={self.distribution}, bias={self.bias}'
+ return info
+
+
+@INITIALIZERS.register_module(name='Normal')
+class NormalInit(BaseInit):
+ r"""Initialize module parameters with the values drawn from the normal
+ distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`.
+
+ Args:
+ mean (int | float):the mean of the normal distribution. Defaults to 0.
+ std (int | float): the standard deviation of the normal distribution.
+ Defaults to 1.
+ bias (int | float): the value to fill the bias. Defaults to 0.
+ bias_prob (float, optional): the probability for bias initialization.
+ Defaults to None.
+        layer (str | list[str], optional): the layer(s) to be initialized.
+ Defaults to None.
+
+ """
+
+ def __init__(self, mean: float = 0, std: float = 1, **kwargs):
+ super().__init__(**kwargs)
+ self.mean = mean
+ self.std = std
+
+ def __call__(self, module: nn.Module) -> None:
+
+ def init(m):
+ if self.wholemodule:
+ normal_init(m, self.mean, self.std, self.bias)
+ else:
+ layername = m.__class__.__name__
+ basesname = _get_bases_name(m)
+ if len(set(self.layer) & set([layername] + basesname)):
+ normal_init(m, self.mean, self.std, self.bias)
+
+ module.apply(init)
+ if hasattr(module, '_params_init_info'):
+ update_init_info(module, init_info=self._get_init_info())
+
+ def _get_init_info(self) -> str:
+ info = f'{self.__class__.__name__}: mean={self.mean},' \
+ f' std={self.std}, bias={self.bias}'
+ return info
+
+
+@INITIALIZERS.register_module(name='TruncNormal')
+class TruncNormalInit(BaseInit):
+ r"""Initialize module parameters with the values drawn from the normal
+ distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` with values
+    outside :math:`[a, b]` redrawn until they are within the bounds.
+
+ Args:
+ mean (float): the mean of the normal distribution. Defaults to 0.
+ std (float): the standard deviation of the normal distribution.
+ Defaults to 1.
+ a (float): The minimum cutoff value.
+        b (float): The maximum cutoff value.
+ bias (float): the value to fill the bias. Defaults to 0.
+ bias_prob (float, optional): the probability for bias initialization.
+ Defaults to None.
+        layer (str | list[str], optional): the layer(s) to be initialized.
+ Defaults to None.
+
+ """
+
+ def __init__(self,
+ mean: float = 0,
+ std: float = 1,
+ a: float = -2,
+ b: float = 2,
+ **kwargs) -> None:
+ super().__init__(**kwargs)
+ self.mean = mean
+ self.std = std
+ self.a = a
+ self.b = b
+
+ def __call__(self, module: nn.Module) -> None:
+
+ def init(m):
+ if self.wholemodule:
+ trunc_normal_init(m, self.mean, self.std, self.a, self.b,
+ self.bias)
+ else:
+ layername = m.__class__.__name__
+ basesname = _get_bases_name(m)
+ if len(set(self.layer) & set([layername] + basesname)):
+ trunc_normal_init(m, self.mean, self.std, self.a, self.b,
+ self.bias)
+
+ module.apply(init)
+ if hasattr(module, '_params_init_info'):
+ update_init_info(module, init_info=self._get_init_info())
+
+ def _get_init_info(self):
+ info = f'{self.__class__.__name__}: a={self.a}, b={self.b},' \
+ f' mean={self.mean}, std={self.std}, bias={self.bias}'
+ return info
+
+
+@INITIALIZERS.register_module(name='Uniform')
+class UniformInit(BaseInit):
+ r"""Initialize module parameters with values drawn from the uniform
+ distribution :math:`\mathcal{U}(a, b)`.
+
+ Args:
+ a (int | float): the lower bound of the uniform distribution.
+ Defaults to 0.
+ b (int | float): the upper bound of the uniform distribution.
+ Defaults to 1.
+ bias (int | float): the value to fill the bias. Defaults to 0.
+ bias_prob (float, optional): the probability for bias initialization.
+ Defaults to None.
+        layer (str | list[str], optional): the layer(s) to be initialized.
+ Defaults to None.
+ """
+
+ def __init__(self, a: float = 0., b: float = 1., **kwargs):
+ super().__init__(**kwargs)
+ self.a = a
+ self.b = b
+
+ def __call__(self, module: nn.Module) -> None:
+
+ def init(m):
+ if self.wholemodule:
+ uniform_init(m, self.a, self.b, self.bias)
+ else:
+ layername = m.__class__.__name__
+ basesname = _get_bases_name(m)
+ if len(set(self.layer) & set([layername] + basesname)):
+ uniform_init(m, self.a, self.b, self.bias)
+
+ module.apply(init)
+ if hasattr(module, '_params_init_info'):
+ update_init_info(module, init_info=self._get_init_info())
+
+ def _get_init_info(self) -> str:
+ info = f'{self.__class__.__name__}: a={self.a},' \
+ f' b={self.b}, bias={self.bias}'
+ return info
+
+
+@INITIALIZERS.register_module(name='Kaiming')
+class KaimingInit(BaseInit):
+ r"""Initialize module parameters with the values according to the method
+    described in "Delving deep into rectifiers: Surpassing human-level
+    performance on ImageNet classification" - He, K. et al. (2015).
+
+ Args:
+ a (int | float): the negative slope of the rectifier used after this
+ layer (only used with ``'leaky_relu'``). Defaults to 0.
+ mode (str): either ``'fan_in'`` or ``'fan_out'``. Choosing
+ ``'fan_in'`` preserves the magnitude of the variance of the weights
+ in the forward pass. Choosing ``'fan_out'`` preserves the
+ magnitudes in the backwards pass. Defaults to ``'fan_out'``.
+ nonlinearity (str): the non-linear function (`nn.functional` name),
+ recommended to use only with ``'relu'`` or ``'leaky_relu'`` .
+ Defaults to 'relu'.
+ bias (int | float): the value to fill the bias. Defaults to 0.
+ bias_prob (float, optional): the probability for bias initialization.
+ Defaults to None.
+ distribution (str): distribution either be ``'normal'`` or
+ ``'uniform'``. Defaults to ``'normal'``.
+        layer (str | list[str], optional): the layer(s) to be initialized.
+ Defaults to None.
+ """
+
+ def __init__(self,
+ a: float = 0,
+ mode: str = 'fan_out',
+ nonlinearity: str = 'relu',
+ distribution: str = 'normal',
+ **kwargs):
+ super().__init__(**kwargs)
+ self.a = a
+ self.mode = mode
+ self.nonlinearity = nonlinearity
+ self.distribution = distribution
+
+ def __call__(self, module: nn.Module) -> None:
+
+ def init(m):
+ if self.wholemodule:
+ kaiming_init(m, self.a, self.mode, self.nonlinearity,
+ self.bias, self.distribution)
+ else:
+ layername = m.__class__.__name__
+ basesname = _get_bases_name(m)
+ if len(set(self.layer) & set([layername] + basesname)):
+ kaiming_init(m, self.a, self.mode, self.nonlinearity,
+ self.bias, self.distribution)
+
+ module.apply(init)
+ if hasattr(module, '_params_init_info'):
+ update_init_info(module, init_info=self._get_init_info())
+
+ def _get_init_info(self) -> str:
+ info = f'{self.__class__.__name__}: a={self.a}, mode={self.mode}, ' \
+ f'nonlinearity={self.nonlinearity}, ' \
+ f'distribution ={self.distribution}, bias={self.bias}'
+ return info
+
+
+@INITIALIZERS.register_module(name='Caffe2Xavier')
+class Caffe2XavierInit(KaimingInit):
+ # `XavierFill` in Caffe2 corresponds to `kaiming_uniform_` in PyTorch
+ # Acknowledgment to FAIR's internal code
+ def __init__(self, **kwargs):
+ super().__init__(
+ a=1,
+ mode='fan_in',
+ nonlinearity='leaky_relu',
+ distribution='uniform',
+ **kwargs)
+
+ def __call__(self, module: nn.Module) -> None:
+ super().__call__(module)
+
+
+@INITIALIZERS.register_module(name='Pretrained')
+class PretrainedInit:
+ """Initialize module by loading a pretrained model.
+
+ Args:
+        checkpoint (str): the checkpoint file of the pretrained model to be
+            loaded.
+        prefix (str, optional): the prefix of a sub-module in the pretrained
+            model. It is used to load only a part of the pretrained model to
+ initialize. For example, if we would like to only load the
+ backbone of a detector model, we can set ``prefix='backbone.'``.
+ Defaults to None.
+ map_location (str): map tensors into proper locations.
+ """
+
+ def __init__(self,
+ checkpoint: str,
+ prefix: Optional[str] = None,
+ map_location: Optional[str] = None):
+ self.checkpoint = checkpoint
+ self.prefix = prefix
+ self.map_location = map_location
+
+ def __call__(self, module: nn.Module) -> None:
+ from mmcv.runner import (_load_checkpoint_with_prefix, load_checkpoint,
+ load_state_dict)
+ logger = get_logger('mmcv')
+ if self.prefix is None:
+ print_log(f'load model from: {self.checkpoint}', logger=logger)
+ load_checkpoint(
+ module,
+ self.checkpoint,
+ map_location=self.map_location,
+ strict=False,
+ logger=logger)
+ else:
+ print_log(
+ f'load {self.prefix} in model from: {self.checkpoint}',
+ logger=logger)
+ state_dict = _load_checkpoint_with_prefix(
+ self.prefix, self.checkpoint, map_location=self.map_location)
+ load_state_dict(module, state_dict, strict=False, logger=logger)
+
+ if hasattr(module, '_params_init_info'):
+ update_init_info(module, init_info=self._get_init_info())
+
+ def _get_init_info(self) -> str:
+ info = f'{self.__class__.__name__}: load from {self.checkpoint}'
+ return info
+
+
+def _initialize(module: nn.Module,
+ cfg: Dict,
+ wholemodule: bool = False) -> None:
+ func = build_from_cfg(cfg, INITIALIZERS)
+ # wholemodule flag is for override mode, there is no layer key in override
+ # and initializer will give init values for the whole module with the name
+ # in override.
+ func.wholemodule = wholemodule
+ func(module)
+
+
+def _initialize_override(module: nn.Module, override: Union[Dict, List],
+ cfg: Dict) -> None:
+ if not isinstance(override, (dict, list)):
+ raise TypeError(f'override must be a dict or a list of dict, \
+ but got {type(override)}')
+
+ override = [override] if isinstance(override, dict) else override
+
+ for override_ in override:
+
+ cp_override = copy.deepcopy(override_)
+ name = cp_override.pop('name', None)
+ if name is None:
+ raise ValueError('`override` must contain the key "name",'
+ f'but got {cp_override}')
+ # if override only has name key, it means use args in init_cfg
+ if not cp_override:
+ cp_override.update(cfg)
+ # if override has name key and other args except type key, it will
+ # raise error
+ elif 'type' not in cp_override.keys():
+ raise ValueError(
+ f'`override` need "type" key, but got {cp_override}')
+
+ if hasattr(module, name):
+ _initialize(getattr(module, name), cp_override, wholemodule=True)
+ else:
+ raise RuntimeError(f'module did not have attribute {name}, '
+ f'but init_cfg is {cp_override}.')
+
+
+def initialize(module: nn.Module, init_cfg: Union[Dict, List[dict]]) -> None:
+ r"""Initialize a module.
+
+ Args:
+        module (``torch.nn.Module``): the module to be initialized.
+ init_cfg (dict | list[dict]): initialization configuration dict to
+ define initializer. OpenMMLab has implemented 6 initializers
+ including ``Constant``, ``Xavier``, ``Normal``, ``Uniform``,
+ ``Kaiming``, and ``Pretrained``.
+
+ Example:
+ >>> module = nn.Linear(2, 3, bias=True)
+ >>> init_cfg = dict(type='Constant', layer='Linear', val =1 , bias =2)
+ >>> initialize(module, init_cfg)
+
+ >>> module = nn.Sequential(nn.Conv1d(3, 1, 3), nn.Linear(1,2))
+ >>> # define key ``'layer'`` for initializing layer with different
+ >>> # configuration
+ >>> init_cfg = [dict(type='Constant', layer='Conv1d', val=1),
+ dict(type='Constant', layer='Linear', val=2)]
+ >>> initialize(module, init_cfg)
+
+ >>> # define key``'override'`` to initialize some specific part in
+ >>> # module
+ >>> class FooNet(nn.Module):
+ >>> def __init__(self):
+ >>> super().__init__()
+ >>> self.feat = nn.Conv2d(3, 16, 3)
+ >>> self.reg = nn.Conv2d(16, 10, 3)
+ >>> self.cls = nn.Conv2d(16, 5, 3)
+ >>> model = FooNet()
+ >>> init_cfg = dict(type='Constant', val=1, bias=2, layer='Conv2d',
+ >>> override=dict(type='Constant', name='reg', val=3, bias=4))
+ >>> initialize(model, init_cfg)
+
+ >>> model = ResNet(depth=50)
+ >>> # Initialize weights with the pretrained model.
+ >>> init_cfg = dict(type='Pretrained',
+ checkpoint='torchvision://resnet50')
+ >>> initialize(model, init_cfg)
+
+ >>> # Initialize weights of a sub-module with the specific part of
+ >>> # a pretrained model by using "prefix".
+ >>> url = 'http://download.openmmlab.com/mmdetection/v2.0/retinanet/'\
+ >>> 'retinanet_r50_fpn_1x_coco/'\
+ >>> 'retinanet_r50_fpn_1x_coco_20200130-c2398f9e.pth'
+ >>> init_cfg = dict(type='Pretrained',
+ checkpoint=url, prefix='backbone.')
+ """
+ if not isinstance(init_cfg, (dict, list)):
+ raise TypeError(f'init_cfg must be a dict or a list of dict, \
+ but got {type(init_cfg)}')
+
+ if isinstance(init_cfg, dict):
+ init_cfg = [init_cfg]
+
+ for cfg in init_cfg:
+ # should deeply copy the original config because cfg may be used by
+ # other modules, e.g., one init_cfg shared by multiple bottleneck
+ # blocks, the expected cfg will be changed after pop and will change
+ # the initialization behavior of other modules
+ cp_cfg = copy.deepcopy(cfg)
+ override = cp_cfg.pop('override', None)
+ _initialize(module, cp_cfg)
+
+ if override is not None:
+ cp_cfg.pop('layer', None)
+ _initialize_override(module, override, cp_cfg)
+ else:
+ # All attributes in module have same initialization.
+ pass
+
+
+def _no_grad_trunc_normal_(tensor: Tensor, mean: float, std: float, a: float,
+ b: float) -> Tensor:
+ # Method based on
+ # https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+ # Modified from
+ # https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
+ def norm_cdf(x):
+ # Computes standard normal cumulative distribution function
+ return (1. + math.erf(x / math.sqrt(2.))) / 2.
+
+ if (mean < a - 2 * std) or (mean > b + 2 * std):
+ warnings.warn(
+ 'mean is more than 2 std from [a, b] in nn.init.trunc_normal_. '
+ 'The distribution of values may be incorrect.',
+ stacklevel=2)
+
+ with torch.no_grad():
+ # Values are generated by using a truncated uniform distribution and
+ # then using the inverse CDF for the normal distribution.
+ # Get upper and lower cdf values
+ lower = norm_cdf((a - mean) / std)
+ upper = norm_cdf((b - mean) / std)
+
+ # Uniformly fill tensor with values from [lower, upper], then translate
+ # to [2lower-1, 2upper-1].
+ tensor.uniform_(2 * lower - 1, 2 * upper - 1)
+
+ # Use inverse cdf transform for normal distribution to get truncated
+ # standard normal
+ tensor.erfinv_()
+
+ # Transform to proper mean, std
+ tensor.mul_(std * math.sqrt(2.))
+ tensor.add_(mean)
+
+ # Clamp to ensure it's in the proper range
+ tensor.clamp_(min=a, max=b)
+ return tensor
+
+
+def trunc_normal_(tensor: Tensor,
+ mean: float = 0.,
+ std: float = 1.,
+ a: float = -2.,
+ b: float = 2.) -> Tensor:
+ r"""Fills the input Tensor with values drawn from a truncated
+ normal distribution. The values are effectively drawn from the
+ normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
+ with values outside :math:`[a, b]` redrawn until they are within
+ the bounds. The method used for generating the random values works
+ best when :math:`a \leq \text{mean} \leq b`.
+
+ Modified from
+ https://github.com/pytorch/pytorch/blob/master/torch/nn/init.py
+
+ Args:
+ tensor (``torch.Tensor``): an n-dimensional `torch.Tensor`.
+ mean (float): the mean of the normal distribution.
+ std (float): the standard deviation of the normal distribution.
+ a (float): the minimum cutoff value.
+ b (float): the maximum cutoff value.
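+
+    Example (illustrative usage sketch; ``w`` is an assumed tensor name):
+        >>> w = torch.empty(3, 5)
+        >>> trunc_normal_(w, mean=0., std=1., a=-2., b=2.)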
+ """
+ return _no_grad_trunc_normal_(tensor, mean, std, a, b)
diff --git a/mmcv/mmcv/cnn/vgg.py b/mmcv/mmcv/cnn/vgg.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1d9ba211eb4b0056eb4127e19159e9ed5d5251f
--- /dev/null
+++ b/mmcv/mmcv/cnn/vgg.py
@@ -0,0 +1,177 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+from typing import List, Optional, Sequence, Tuple, Union
+
+import torch.nn as nn
+from torch import Tensor
+
+from .utils import constant_init, kaiming_init, normal_init
+
+
+def conv3x3(in_planes: int, out_planes: int, dilation: int = 1) -> nn.Module:
+ """3x3 convolution with padding."""
+ return nn.Conv2d(
+ in_planes,
+ out_planes,
+ kernel_size=3,
+ padding=dilation,
+ dilation=dilation)
+
+
+def make_vgg_layer(inplanes: int,
+ planes: int,
+ num_blocks: int,
+ dilation: int = 1,
+ with_bn: bool = False,
+ ceil_mode: bool = False) -> List[nn.Module]:
+ layers = []
+ for _ in range(num_blocks):
+ layers.append(conv3x3(inplanes, planes, dilation))
+ if with_bn:
+ layers.append(nn.BatchNorm2d(planes))
+ layers.append(nn.ReLU(inplace=True))
+ inplanes = planes
+ layers.append(nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=ceil_mode))
+
+ return layers
+
+
+class VGG(nn.Module):
+ """VGG backbone.
+
+ Args:
+ depth (int): Depth of vgg, from {11, 13, 16, 19}.
+ with_bn (bool): Use BatchNorm or not.
+ num_classes (int): number of classes for classification.
+ num_stages (int): VGG stages, normally 5.
+ dilations (Sequence[int]): Dilation of each stage.
+ out_indices (Sequence[int]): Output from which stages.
+ frozen_stages (int): Stages to be frozen (all param fixed). -1 means
+ not freezing any parameters.
+ bn_eval (bool): Whether to set BN layers as eval mode, namely, freeze
+ running stats (mean and var).
+ bn_frozen (bool): Whether to freeze weight and bias of BN layers.
+ """
+
+ arch_settings = {
+ 11: (1, 1, 2, 2, 2),
+ 13: (2, 2, 2, 2, 2),
+ 16: (2, 2, 3, 3, 3),
+ 19: (2, 2, 4, 4, 4)
+ }
+
+ def __init__(self,
+ depth: int,
+ with_bn: bool = False,
+ num_classes: int = -1,
+ num_stages: int = 5,
+ dilations: Sequence[int] = (1, 1, 1, 1, 1),
+ out_indices: Sequence[int] = (0, 1, 2, 3, 4),
+ frozen_stages: int = -1,
+ bn_eval: bool = True,
+ bn_frozen: bool = False,
+ ceil_mode: bool = False,
+ with_last_pool: bool = True):
+ super().__init__()
+ if depth not in self.arch_settings:
+ raise KeyError(f'invalid depth {depth} for vgg')
+ assert num_stages >= 1 and num_stages <= 5
+ stage_blocks = self.arch_settings[depth]
+ self.stage_blocks = stage_blocks[:num_stages]
+ assert len(dilations) == num_stages
+ assert max(out_indices) <= num_stages
+
+ self.num_classes = num_classes
+ self.out_indices = out_indices
+ self.frozen_stages = frozen_stages
+ self.bn_eval = bn_eval
+ self.bn_frozen = bn_frozen
+
+ self.inplanes = 3
+ start_idx = 0
+ vgg_layers = []
+ self.range_sub_modules = []
+ for i, num_blocks in enumerate(self.stage_blocks):
+ num_modules = num_blocks * (2 + with_bn) + 1
+ end_idx = start_idx + num_modules
+ dilation = dilations[i]
+ planes = 64 * 2**i if i < 4 else 512
+ vgg_layer = make_vgg_layer(
+ self.inplanes,
+ planes,
+ num_blocks,
+ dilation=dilation,
+ with_bn=with_bn,
+ ceil_mode=ceil_mode)
+ vgg_layers.extend(vgg_layer)
+ self.inplanes = planes
+ self.range_sub_modules.append([start_idx, end_idx])
+ start_idx = end_idx
+ if not with_last_pool:
+ vgg_layers.pop(-1)
+ self.range_sub_modules[-1][1] -= 1
+ self.module_name = 'features'
+ self.add_module(self.module_name, nn.Sequential(*vgg_layers))
+
+ if self.num_classes > 0:
+ self.classifier = nn.Sequential(
+ nn.Linear(512 * 7 * 7, 4096),
+ nn.ReLU(True),
+ nn.Dropout(),
+ nn.Linear(4096, 4096),
+ nn.ReLU(True),
+ nn.Dropout(),
+ nn.Linear(4096, num_classes),
+ )
+
+ def init_weights(self, pretrained: Optional[str] = None) -> None:
+ if isinstance(pretrained, str):
+ logger = logging.getLogger()
+ from ..runner import load_checkpoint
+ load_checkpoint(self, pretrained, strict=False, logger=logger)
+ elif pretrained is None:
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ kaiming_init(m)
+ elif isinstance(m, nn.BatchNorm2d):
+ constant_init(m, 1)
+ elif isinstance(m, nn.Linear):
+ normal_init(m, std=0.01)
+ else:
+ raise TypeError('pretrained must be a str or None')
+
+ def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor, ...]]:
+ outs = []
+ vgg_layers = getattr(self, self.module_name)
+ for i in range(len(self.stage_blocks)):
+ for j in range(*self.range_sub_modules[i]):
+ vgg_layer = vgg_layers[j]
+ x = vgg_layer(x)
+ if i in self.out_indices:
+ outs.append(x)
+ if self.num_classes > 0:
+ x = x.view(x.size(0), -1)
+ x = self.classifier(x)
+ outs.append(x)
+ if len(outs) == 1:
+ return outs[0]
+ else:
+ return tuple(outs)
+
+ def train(self, mode: bool = True) -> None:
+ super().train(mode)
+ if self.bn_eval:
+ for m in self.modules():
+ if isinstance(m, nn.BatchNorm2d):
+ m.eval()
+ if self.bn_frozen:
+ for params in m.parameters():
+ params.requires_grad = False
+ vgg_layers = getattr(self, self.module_name)
+ if mode and self.frozen_stages >= 0:
+ for i in range(self.frozen_stages):
+ for j in range(*self.range_sub_modules[i]):
+ mod = vgg_layers[j]
+ mod.eval()
+ for param in mod.parameters():
+ param.requires_grad = False
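+
+
+# Usage sketch (illustrative, not part of the original module; the dummy
+# input tensor below is an assumption for demonstration only):
+#
+#   import torch
+#   model = VGG(depth=16, num_classes=-1, out_indices=(4, ))
+#   model.init_weights()  # random init when no pretrained path is given
+#   feat = model(torch.randn(1, 3, 224, 224))  # single requested stage -> Tensor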
diff --git a/mmcv/mmcv/device/__init__.py b/mmcv/mmcv/device/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba217b0771bcfada461d7c61a78f41a274e5aa6a
--- /dev/null
+++ b/mmcv/mmcv/device/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from . import ipu, mlu, mps
+from .scatter_gather import scatter, scatter_kwargs
+from .utils import get_device
+
+__all__ = ['mlu', 'ipu', 'mps', 'get_device', 'scatter', 'scatter_kwargs']
diff --git a/mmcv/mmcv/device/_functions.py b/mmcv/mmcv/device/_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..462a7e4ddca14685047b7937e3054108e164cf91
--- /dev/null
+++ b/mmcv/mmcv/device/_functions.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Union
+
+import torch
+
+from mmcv.utils import deprecated_api_warning
+from .utils import get_device
+
+
+def scatter(input: Union[List, torch.Tensor], devices: List) -> List:
+ """scatter copies tensor to devices directly."""
+ current_device = get_device()
+ if isinstance(input, list):
+ outputs = [scatter(_input, devices) for _input in input]
+ return outputs
+ elif isinstance(input, torch.Tensor):
+ output = input.contiguous()
+ return output.to(current_device) if devices != [-1] else output
+ else:
+ raise Exception(f'Unknown type {type(input)}.')
+
+
+class Scatter:
+
+ @staticmethod
+ @deprecated_api_warning({'target_mlus': 'target_devices'},
+ cls_name='Scatter')
+ def forward(target_devices, input):
+ outputs = scatter(input, target_devices)
+ return tuple(outputs) if isinstance(outputs, list) else (outputs, )
diff --git a/mmcv/mmcv/device/ipu/__init__.py b/mmcv/mmcv/device/ipu/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..d550865ad20790f0eb79015abc866548c0f2f83b
--- /dev/null
+++ b/mmcv/mmcv/device/ipu/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.utils import IS_IPU_AVAILABLE
+
+if IS_IPU_AVAILABLE:
+ from .dataloader import IPUDataLoader
+ from .hook_wrapper import IPUFp16OptimizerHook
+ from .model_wrapper import ipu_model_wrapper
+ from .runner import IPUBaseRunner, IPUEpochBasedRunner, IPUIterBasedRunner
+ from .utils import cfg2options
+ __all__ = [
+ 'cfg2options', 'ipu_model_wrapper', 'IPUFp16OptimizerHook',
+ 'IPUDataLoader', 'IPUBaseRunner', 'IPUEpochBasedRunner',
+ 'IPUIterBasedRunner'
+ ]
diff --git a/mmcv/mmcv/device/ipu/dataloader.py b/mmcv/mmcv/device/ipu/dataloader.py
new file mode 100755
index 0000000000000000000000000000000000000000..1485df2f31facff79238c70d89fdd9030fddcbce
--- /dev/null
+++ b/mmcv/mmcv/device/ipu/dataloader.py
@@ -0,0 +1,157 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from collections.abc import Mapping, Sequence
+from functools import partial
+
+import poptorch
+from torch.utils.data.dataloader import default_collate
+
+from mmcv.parallel import DataContainer
+
+
+def collate(batch, samples_per_gpu=1):
+ """Put each data field into a tensor/DataContainer with outer dimension
+ batch size.
+
+ TODO support for
+ :type:`~mmcv.parallel.DataContainer`. Currently, it will be ignored.
+ There are 3 cases.
+
+ 1. cpu_only = True, e.g., meta data.
+ 2. cpu_only = False, stack = True, e.g., images tensors.
+ 3. cpu_only = False, stack = False, e.g., gt bboxes.
+ """
+
+ if not isinstance(batch, Sequence):
+ raise TypeError(
+ f'`batch` should be a sequence, but got {type(batch)}.')
+
+ if isinstance(batch[0], DataContainer):
+ # TODO `DataContainer` will be supported in the future.
+ raise TypeError('DataContainer is not supported in ipu data loader.')
+ elif isinstance(batch[0], Sequence):
+ transposed = zip(*batch)
+ collated_batch = []
+ for samples in transposed:
+ if not isinstance(samples[0], DataContainer):
+                # At present, DataContainer processing is skipped,
+                # which reduces the performance of the IPU DataLoader
+ collated_batch.append(collate(samples, samples_per_gpu))
+ return collated_batch
+ elif isinstance(batch[0], Mapping):
+ collated_batch = {}
+ for key in batch[0]:
+ if not isinstance(batch[0][key], DataContainer):
+                # At present, DataContainer processing is skipped,
+                # which reduces the performance of the IPU DataLoader
+ collated_batch[key] = collate([d[key] for d in batch])
+ return collated_batch
+ else:
+ return default_collate(batch)
+
+
+class IPUDataLoader(poptorch.DataLoader):
+ """Thin wrapper of `torch.utils.data.DataLoader`.
+
+    Compared with the PyTorch DataLoader, this DataLoader changes the way the
+    batch size is calculated and adds an AsynchronousDataAccessor to
+    load and release data faster in CPU mode.
+
+ If this data loader is used in a distributed execution environment, it will
+ ensure that each process uses a different subset of the dataset, providing
+ you first call ``options.randomSeed(N)`` with an integer N which is the
+ same across all hosts.
+
+ Args:
+ dataset (torch.utils.data.Dataset): The dataset to get the data from.
+ options (poptorch.Options): Options that will be used to compile
+ and run the model.
+ batch_size (int, optional): This is the batch size in the conventional
+ sense of being the size that runs through an operation in the model
+ at any given time.
+ shuffle (bool, optional): set to ``True`` to have the data reshuffled
+ at every epoch (default: ``False``).
+ num_workers (int, optional): how many subprocesses to use for data
+ loading. ``0`` means that the data will be loaded in the main
+ process. (default: ``0``)
+ drop_last (bool, optional): If True and the number of elements in the
+ dataset is not a multiple of the combined batch size then the
+ incomplete batch at the end will be dropped.
+ persistent_workers (bool, optional): Re-use workers between
+ iterations if True.
+ auto_distributed_partitioning (bool, optional): If True, partitions the
+ dataset for distributed execution automatically. Otherwise, it is
+ assumed that partitioning has been handled manually.
+ mode (poptorch.DataLoaderMode, optional): If `DataLoaderMode.Async`,
+ uses an :py:class:`~poptorch.AsynchronousDataAccessor` to access
+ the dataset. If `DataLoaderMode.Sync`, accesses the dataset
+ synchronously.
+ async_options (Dict[str, Any], optional): Options to pass to
+ :py:class:`~poptorch.AsynchronousDataAccessor`.
+ rebatched_worker_size (int, optional): When using AsyncRebatched: batch
+ size of the tensors loaded by the workers.
+ Default to the combined batch size.
+ If specified the ``rebatched_worker_size`` must be less than
+ or equal to the combined batch size.
+ kwargs (Dict[str, Any], optional): Other options to pass to PyTorch's
+ ``DataLoader`` constructor.
+ """
+
+ def __init__(self,
+ dataset,
+ options,
+ batch_size=1,
+ shuffle=False,
+ num_workers=0,
+ drop_last=True,
+ persistent_workers=True,
+ auto_distributed_partitioning=True,
+ mode='sync',
+ async_options=None,
+ rebatched_worker_size=None,
+ **kwargs):
+ """Lazy init:
+
+        In many frameworks, the dataloader is constructed before the IPU
+        options are initialized, so lazy initialization is used here: the
+        real initialization is deferred until the dataloader needs to be
+        used and the options are passed in.
+ """
+ # lazy init: sometimes, we cannot get IPU options when build data
+ # loader
+ self.kwargs = {
+ 'dataset': dataset,
+ 'batch_size': batch_size,
+ 'shuffle': shuffle,
+ 'num_workers': num_workers,
+ 'drop_last': drop_last,
+ 'persistent_workers': persistent_workers,
+ 'auto_distributed_partitioning': auto_distributed_partitioning,
+ 'mode': mode,
+ 'collate_fn': partial(collate, samples_per_gpu=batch_size),
+ 'async_options': async_options,
+ 'rebatched_worker_size': rebatched_worker_size,
+ **kwargs
+ }
+ self.dataset = dataset
+ self.initialized = False
+ if options:
+ self.init(options=options)
+
+ def init(self, options, **kwargs):
+ if not self.initialized:
+ kwargs = {**self.kwargs, **kwargs, 'options': options}
+ if kwargs['mode'] == 'sync':
+ kwargs['mode'] = poptorch.DataLoaderMode.Sync
+ elif kwargs['mode'] == 'async':
+ kwargs['mode'] = poptorch.DataLoaderMode.AsyncRebatched
+ if kwargs['async_options'] is None:
+ kwargs['async_options'] = {
+ 'load_indefinitely': True,
+ 'buffer_size': 8
+ }
+ if kwargs['rebatched_worker_size'] is None:
+ kwargs['rebatched_worker_size'] = 128
+ super().__init__(**kwargs)
+ self.initialized = True
+
+ return self
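+
+
+# Usage sketch (illustrative; `my_dataset` and `ipu_options` are assumed
+# names, not defined in this module):
+#
+#   loader = IPUDataLoader(my_dataset, options=None, batch_size=4)
+#   ...  # build the poptorch options later, e.g. via cfg2options
+#   loader.init(options=ipu_options)  # real initialization happens here
+#   for batch in loader:
+#       ...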
diff --git a/mmcv/mmcv/device/ipu/hierarchical_data_manager.py b/mmcv/mmcv/device/ipu/hierarchical_data_manager.py
new file mode 100755
index 0000000000000000000000000000000000000000..a6f3b3cd2a139bcbc7852e7849071ab4b9fbb76f
--- /dev/null
+++ b/mmcv/mmcv/device/ipu/hierarchical_data_manager.py
@@ -0,0 +1,243 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import numpy as np
+import torch
+
+from mmcv.parallel import DataContainer
+
+# A customized None type for HierarchicalDataManager
+HierarchicalDataNone = object()
+
+
+class HierarchicalDataManager:
+ """A class manage all the tensors in the hierarchical data.
+
+ At present, the input data structure accepted by IPU is limited,
+ when the input data structure of mmcv varies.
+ Here, an intermediate class is needed to get and update tensors
+ from the original data.
+
+ HierarchicalDataManager will record a hierarchical input/output data in
+ self._hierarchical_data. For example, we have an input data:
+ {'img': tensorA, 'label': tensorB, 'img_metas': [tensorC, tensorD]}
+ To enable IPU to use the input, HierarchicalDataManager will collect
+ the torch tensors from self._hierarchical_data into a tuple like:
+ (tensorA, tensorB, tensorC, tensorD).
+    Meanwhile, since the IPU returns a tuple of tensors, HierarchicalDataManager
+    also has a function named update_all_tensors to put the tensors back into
+    self._hierarchical_data, which is the output for upper calls.
+
+ Args:
+ logger (:obj:`logging.Logger`): Logger used during running.
+ Defaults to None.
+ """
+
+ def __init__(self, logger=None):
+ self.atomic_types = (int, str, float, np.ndarray, type(None))
+ self.warning = warnings.warn if logger is None else logger.warning
+ # enable or disable input data's shape and value check
+ self.quick_mode = False
+ self._hierarchical_data = None
+
+ def quick(self):
+ self.quick_mode = True
+
+ def compare_atomic_type(self, a, b):
+ """Compare data, supported datatypes are numpy array and python basic
+ types."""
+ if isinstance(a, np.ndarray):
+ return np.all(a == b)
+ else:
+ return a == b
+
+ def record_hierarchical_data(self, data):
+ """Record a hierarchical data."""
+ if self._hierarchical_data is not None:
+ if isinstance(data, torch.Tensor):
+ assert isinstance(self._hierarchical_data, torch.Tensor), \
+ 'original hierarchical data is not torch.tensor'
+ self._hierarchical_data = data
+ else:
+ self.update_hierarchical_data(data)
+ else:
+ self._hierarchical_data = data
+
+ @property
+ def hierarchical_data(self):
+ return self._hierarchical_data
+
+ def update_hierarchical_data(self,
+ dataA,
+ dataB=HierarchicalDataNone,
+ strict=True,
+ address='data'):
+ """Update dataB with dataA in-place.
+
+ Args:
+ dataA (list or dict or tuple): New hierarchical data.
+ dataB (list or dict or tuple): hierarchical data to update.
+                If not specified, self.hierarchical_data will be updated.
+ strict (bool, optional): If true, an error will be reported
+ when the following conditions occur:
+ 1. Non-torch.Tensor data changed.
+ 2. Torch.Tensor data shape changed.
+ address (str): Record the address of current data to be updated.
+ Default: 'data'.
+ """
+ if dataB is HierarchicalDataNone:
+ dataB = self.hierarchical_data
+
+        # Update with data that has the same structure
+        # but different values (tensors and basic python data types)
+ if isinstance(dataA, (tuple, list)):
+ for idx, node in enumerate(dataA):
+ new_address = ''
+ if not self.quick_mode:
+ new_address = address + f'[{str(idx)}]'
+ assert isinstance(node, type(dataB[idx])),\
+ f'data structure changed: {new_address}'
+ if isinstance(node, torch.Tensor):
+ dataB[idx] = node
+ else:
+ self.update_hierarchical_data(
+ node, dataB[idx], strict, address=new_address)
+ elif isinstance(dataA, dict):
+ for k, v in dataA.items():
+ new_address = ''
+ if not self.quick_mode:
+ new_address = address + f'[{str(k)}]'
+ assert isinstance(v, type(dataB[k])),\
+ f'data structure changed: {new_address}'
+ if isinstance(v, torch.Tensor):
+ dataB[k] = v
+ else:
+ self.update_hierarchical_data(
+ v, dataB[k], strict, address=new_address)
+ elif isinstance(dataA, self.atomic_types):
+ if not self.quick_mode:
+ is_equal = self.compare_atomic_type(dataA, dataB)
+ if not is_equal:
+ if strict:
+ raise ValueError(
+ 'all data except torch.Tensor should be same, '
+ f'but data({address}) is changed.')
+ else:
+ self.warning(
+ f'find a non-torch.Tensor data({type(dataA)}) '
+ f'changed, and the address is {address}')
+ elif isinstance(dataA, DataContainer):
+ if not self.quick_mode:
+ assert isinstance(dataB, DataContainer)
+ new_address = address + '.data'
+ self.update_hierarchical_data(
+ dataA.data, dataB.data, False, address=new_address)
+ else:
+ raise NotImplementedError(
+ f'not supported datatype:{type(dataA)}, address is {address}')
+
+ def collect_all_tensors(self, hierarchical_data=None):
+ """Collect torch.Tensor data from self.hierarchical_data to a list and
+ return."""
+ # get a list of tensor from self._hierarchical_data
+ if hierarchical_data is None:
+ hierarchical_data = self._hierarchical_data
+ tensors = []
+ if isinstance(hierarchical_data, torch.Tensor):
+ tensors = [hierarchical_data]
+ else:
+ self._collect_tensors(hierarchical_data, tensors)
+ return tensors
+
+ def _collect_tensors(self, data, tensors):
+ if isinstance(data, (tuple, list)):
+ for node in data:
+ if isinstance(node, torch.Tensor):
+ tensors.append(node)
+ else:
+ self._collect_tensors(node, tensors)
+ elif isinstance(data, dict):
+ for v in data.values():
+ if isinstance(v, torch.Tensor):
+ tensors.append(v)
+ else:
+ self._collect_tensors(v, tensors)
+ elif isinstance(data, self.atomic_types):
+ pass
+ elif isinstance(data, DataContainer):
+ self._collect_tensors(data.data, tensors)
+ else:
+ raise NotImplementedError(f'not supported datatype:{type(data)}')
+
+ def update_all_tensors(self, tensors):
+ """Put tensors from tuple back to self.hierarchical_data."""
+ if isinstance(self._hierarchical_data, torch.Tensor):
+ assert len(tensors) == 1
+ assert isinstance(tensors[0], torch.Tensor)
+ self._hierarchical_data = tensors[0]
+ else:
+ # convert to list if tensors is tuple
+ tensors = list(tensors)
+ self._set_tensors(self._hierarchical_data, tensors)
+ return self.hierarchical_data
+
+ def _set_tensors(self, data, tensors):
+ if isinstance(data, tuple):
+ data = list(data)
+ for idx in range(len(data)):
+ if isinstance(data[idx], torch.Tensor):
+ data[idx] = tensors.pop(0)
+ else:
+ self._set_tensors(data[idx], tensors)
+ data = tuple(data)
+ elif isinstance(data, list):
+ for idx in range(len(data)):
+ if isinstance(data[idx], torch.Tensor):
+ data[idx] = tensors.pop(0)
+ else:
+ self._set_tensors(data[idx], tensors)
+ elif isinstance(data, dict):
+ for k, v in data.items():
+ if isinstance(v, torch.Tensor):
+ data[k] = tensors.pop(0)
+ else:
+ self._set_tensors(v, tensors)
+ elif isinstance(data, self.atomic_types):
+ pass
+ elif isinstance(data, DataContainer):
+ self._set_tensors(data.data, tensors)
+ else:
+ raise NotImplementedError(f'not supported datatype:{type(data)}')
+
+ def clean_all_tensors(self):
+ """Delete tensors from self.hierarchical_data."""
+ self._clean_tensors(self._hierarchical_data)
+
+ def _clean_tensors(self, data):
+ if isinstance(data, tuple):
+ data = list(data)
+ for idx in range(len(data)):
+ if isinstance(data[idx], torch.Tensor):
+ data[idx] = None
+ else:
+ self._clean_tensors(data[idx])
+ data = tuple(data)
+ elif isinstance(data, list):
+ for idx in range(len(data)):
+ if isinstance(data[idx], torch.Tensor):
+ data[idx] = None
+ else:
+ self._clean_tensors(data[idx])
+ elif isinstance(data, dict):
+ for k, v in data.items():
+ if isinstance(v, torch.Tensor):
+ data[k] = None
+ else:
+ self._clean_tensors(v)
+ elif isinstance(data, self.atomic_types):
+ pass
+ elif isinstance(data, DataContainer):
+ self._clean_tensors(data.data)
+ else:
+ raise NotImplementedError(f'not supported datatype:{type(data)}')
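+
+
+# Usage sketch (illustrative; the variable names are assumptions):
+#
+#   manager = HierarchicalDataManager()
+#   manager.record_hierarchical_data({'img': img, 'scale': 1.0})
+#   flat = tuple(manager.collect_all_tensors())  # tensors only, in order
+#   # ... run the compiled executor on `flat`, obtaining `new_flat` ...
+#   outputs = manager.update_all_tensors(new_flat)  # tensors put back in place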
diff --git a/mmcv/mmcv/device/ipu/hook_wrapper.py b/mmcv/mmcv/device/ipu/hook_wrapper.py
new file mode 100755
index 0000000000000000000000000000000000000000..141afb86d05a42c06fb5c4355cb47cae18e9bb2f
--- /dev/null
+++ b/mmcv/mmcv/device/ipu/hook_wrapper.py
@@ -0,0 +1,105 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.runner import HOOKS, LrUpdaterHook, OptimizerHook
+from mmcv.utils import TORCH_VERSION, digit_version
+
+
+def wrap_lr_updater_hook(lr_hook_class):
+ """A wrapper function to wrap any subclass of LrUpdaterHook.
+
+    IPU needs extra operations to upload optimizer settings. This wrapper
+    overrides the ``_set_lr`` function of a subclass of LrUpdaterHook.
+ """
+ assert issubclass(lr_hook_class, LrUpdaterHook)
+
+ class ipu_lr_hook_class(lr_hook_class):
+
+ def _set_lr(self, runner, *args, **kwargs):
+ super()._set_lr(runner, *args, **kwargs)
+ # convert torch optimizer to poptorch optimizer
+ runner.model.setOptimizer(runner.optimizer)
+
+ return ipu_lr_hook_class
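+
+# Usage sketch (illustrative; ``SomeLrUpdaterHook`` stands for any concrete
+# LrUpdaterHook subclass and is an assumed name, not defined here):
+#
+#   IPULrHook = wrap_lr_updater_hook(SomeLrUpdaterHook)
+#   runner.register_hook(IPULrHook(by_epoch=True), priority='VERY_HIGH')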
+
+
+def wrap_optimizer_hook(optimizer_hook_class):
+ """A wrapper function to wrap OptimizerHook.
+
+    This is a non-intrusive implementation of wrapping the optimizer hook
+    (otherwise every config file would need to be changed to use an IPU
+    optimizer hook). IPU's clip-norm implementation is different from
+    pytorch's, so an error is raised when clip-norm is used.
+ """
+
+ class ipu_optimizer_hook_class(OptimizerHook):
+
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+ if self.grad_clip is not None:
+ raise NotImplementedError('IPU does not support gradient clip')
+
+ return ipu_optimizer_hook_class
+
+
+if (TORCH_VERSION != 'parrots'
+ and digit_version(TORCH_VERSION) >= digit_version('1.6.0')):
+
+ @HOOKS.register_module()
+ class IPUFp16OptimizerHook(OptimizerHook):
+ """FP16 optimizer hook (using PyTorch's implementation).
+
+ If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend,
+ to take care of the optimization procedure.
+
+ Args:
+ loss_scale (float | str | dict): Scale factor configuration.
+ If loss_scale is a float, static loss scaling will be used with
+ the specified scale. If loss_scale is a string, it must be
+ 'dynamic', then dynamic loss scaling will be used.
+             It can also be a dict containing arguments of GradScaler.
+ Defaults to 512. For Pytorch >= 1.6, mmcv uses official
+ implementation of GradScaler. If you use a dict version of
+ loss_scale to create GradScaler, please refer to:
+ https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler
+ for the parameters.
+
+ Examples:
+ >>> loss_scale = dict(
+ ... init_scale=65536.0,
+ ... growth_factor=2.0,
+ ... backoff_factor=0.5,
+ ... growth_interval=2000
+ ... )
+ >>> optimizer_hook = Fp16OptimizerHook(loss_scale=loss_scale)
+ """
+
+ def __init__(self,
+ grad_clip=None,
+ coalesce=True,
+ bucket_size_mb=-1,
+ loss_scale=512.,
+ distributed=True):
+ assert grad_clip is None,\
+ 'IPU mode does not support `grad_clip` currently'
+ assert coalesce,\
+ 'implemented all reduce in distributed training currently'
+ assert bucket_size_mb == -1,\
+ '`bucket_size_mb` should not be set in IPU mode'
+ self.distributed = distributed
+ self._scale_update_param = None
+ if loss_scale == 'dynamic':
+ raise NotImplementedError(
+ 'IPU mode does not support dynamic loss scale currently')
+ elif isinstance(loss_scale, float):
+ self.loss_scale = loss_scale
+ elif isinstance(loss_scale, dict):
+ raise NotImplementedError(
+ 'IPU mode supports single scale currently')
+ else:
+ raise ValueError(
+ f'loss_scale should be float, but got {loss_scale} ')
+
+ def after_train_iter(self, runner):
+ pass
+
+else:
+ raise RuntimeError('The IPU mode only supports torch 1.6 and above')
diff --git a/mmcv/mmcv/device/ipu/model_wrapper.py b/mmcv/mmcv/device/ipu/model_wrapper.py
new file mode 100755
index 0000000000000000000000000000000000000000..c345537e29b27cf7fff740269da8643c9570cd36
--- /dev/null
+++ b/mmcv/mmcv/device/ipu/model_wrapper.py
@@ -0,0 +1,721 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import inspect
+from collections import OrderedDict
+from typing import Optional, Union
+
+import poptorch
+import torch
+import torch.nn as nn
+from poptorch import PoplarExecutor, __version__, identity_loss
+from poptorch._args_parser import ArgsParser
+
+from mmcv.runner import auto_fp16
+from .hierarchical_data_manager import HierarchicalDataManager
+from .utils import compare_ndarray, model_sharding, recomputation_checkpoint
+
+
+class DictArgsParser(ArgsParser):
+ """A helper class for handling model input.
+
+ Args:
+ inputs (list): Inputs of model.
+ """
+
+ def __init__(self, inputs):
+ # Combine args and kwargs:
+ self._has_variadic_arguments = True
+ self._varnames = list(inputs.keys())
+ self._defaults = [inspect.Parameter.empty for _ in self._varnames]
+ self._warned_not_contiguous_input = False
+
+
+class WrappedNet(nn.Module):
+ """A net wrapper for model conversion.
+
+ This wrapper will make some changes and add some extra functions to
+ training/inference model.
+
+ Args:
+ model (:obj:`nn.Module`): The model to run.
+ inputs_manager (:obj:`HierarchicalDataManager`): A parser
+ converting inputs from tuple to dictionary.
+ outputs_manager (:obj:`HierarchicalDataManager`): A parser
+ converting outputs from dictionary to tuple.
+ inter_outputs_in_cpu (dict): Specify the features to be
+ recorded.
+ modules_to_record (mmcv.Config, list): Index or name of modules which
+ will be recorded for output. It is necessary to specify output for
+ static graph of model training or inference.
+ """
+
+ def __init__(self,
+ model,
+ inputs_manager,
+ outputs_manager,
+ inter_outputs_in_cpu,
+ modules_to_record=None):
+ super().__init__()
+ self.model = model
+ self.inputs_manager = inputs_manager
+ self.outputs_manager = outputs_manager
+ self.training = model.training
+ # Register a hook function to capture the intermediate features
+ # generated by the network to align the outputs between ipu and cpu
+ # Used to confirm whether the implementation of CPU is consistent
+ # with the implementation of IPU
+ self.inter_outputs_in_cpu = inter_outputs_in_cpu
+ if modules_to_record is None:
+ modules_to_record = []
+
+ for idx, (name, module) in enumerate(model.named_modules()):
+ if name in modules_to_record or idx in modules_to_record:
+ features_hook = self.get_input_output_hook(
+ name, idx, self.inter_outputs_in_cpu)
+ module.register_forward_hook(hook=features_hook)
+
+ def get_input_output_hook(self, name, idx, save_dict):
+
+ def input_output_hook(module, fea_in, fea_out):
+ if isinstance(fea_in, tuple):
+ fea_in = list(fea_in)
+ if isinstance(fea_out, tuple):
+ fea_out = list(fea_out)
+ save_dict[name] = {
+ 'fea_in': fea_in,
+ 'fea_out': fea_out,
+ 'idx': idx
+ }
+ return None
+
+ return input_output_hook
+
+ def forward(self, inputs_tuple):
+ """This function is used to be compiled to ipu, the inputs and outputs
+ need to be tuples, so here we need to restore the input back to a
+ dictionary and convert the output to a tuple."""
+ self.inputs_manager.update_all_tensors(inputs_tuple)
+ kwargs = {**(self.inputs_manager.hierarchical_data)}
+ if self.training:
+ outputs = self.forward_train(kwargs)
+ # tell poptorch which loss will be used finally
+ identity_loss(outputs['loss'], reduction='none')
+ else:
+ outputs = self.forward_eval(kwargs)
+
+ if isinstance(outputs, torch.Tensor):
+            # single tensor output is currently not supported,
+            # so wrap it with a dictionary and
+            # use a keyword to identify this case
+ outputs = {'output of WrappedNet: single tensor': outputs}
+
+ # if there are some features need to be record, add extra outputs
+ for name in self.inter_outputs_in_cpu:
+ outputs[name] = self.inter_outputs_in_cpu[name]
+
+        # record the positions of the returned tensors in the converting stage;
+        # in the real run stage, all the tensors are changed in-place,
+        # which means the output can be obtained directly outside this function
+ self.outputs_manager.record_hierarchical_data(outputs)
+ plain_outputs = self.outputs_manager.collect_all_tensors()
+ return plain_outputs
+
+ def forward_train(self, kwargs):
+ optimizer = kwargs.pop('optimizer')
+ outputs = self.train_step(kwargs, optimizer)
+ return outputs
+
+ def train_step(self, data, optimizer=None, **kwargs):
+ """The iteration step during training.
+
+ This method defines an iteration step during training, except for the
+ back propagation and optimizer updating, which are done in an optimizer
+ hook. Note that in some complicated cases or models, the whole process
+ including back propagation and optimizer updating are also defined in
+ this method, such as GAN.
+
+ Args:
+ data (dict): The output of dataloader.
+ optimizer (:obj:`torch.optim.Optimizer`, optional): The
+ optimizer of runner is passed to ``train_step()``. This
+ argument is unused and reserved.
+
+ Returns:
+ dict: Dict of outputs. The following fields are contained.
+ - loss (torch.Tensor): A tensor for back propagation, which \
+ can be a weighted sum of multiple losses.
+ - log_vars (dict): Dict contains all the variables to be sent \
+ to the logger.
+ - num_samples (int): Indicates the batch size (when the model \
+ is DDP, it means the batch size on each GPU), which is \
+ used for averaging the logs.
+ """
+ losses = self.model(**data)
+ loss, log_vars = self._parse_losses(losses)
+
+ outputs = dict(
+ loss=loss, log_vars=log_vars, num_samples=len(data['img'].data))
+
+ return outputs
+
+ def _parse_losses(self, losses):
+ log_vars = OrderedDict()
+ for loss_name, loss_value in losses.items():
+ if isinstance(loss_value, torch.Tensor):
+ log_vars[loss_name] = loss_value.mean()
+ elif isinstance(loss_value, list):
+ log_vars[loss_name] = sum(loss.mean() for loss in loss_value)
+ elif isinstance(loss_value, dict):
+ for name, value in loss_value.items():
+ log_vars[name] = value
+ else:
+ raise TypeError(
+ f'{loss_name} is not a tensor or list of tensors')
+
+ loss = sum(value for key, value in log_vars.items() if 'loss' in key)
+ log_vars['loss'] = loss
+
+ return loss, log_vars
+
+ def forward_eval(self, kwargs):
+ img = kwargs.pop('img')
+ img_metas = kwargs.pop('img_metas', None)
+ return_loss = kwargs.pop('return_loss')
+ assert not return_loss
+        # TODO Temporarily hard-coded to disable post_process;
+        # otherwise, in the third trace (_check_trace),
+        # post_process would convert the output tensor to a numpy array
+        # automatically, resulting in a _check_trace failure
+ outputs = self.model(
+ img,
+ img_metas=img_metas,
+ return_loss=return_loss,
+ post_process=False)
+ return outputs
+
+
+class MMPoplarExecutor(PoplarExecutor):
+ """An executor for inputs/outputs parsing, model compilation, data
+ alignment and IPU upload/download.
+
+ Args:
+ model (:obj:`nn.Module`): The model to be compiled.
+ logger (:obj:`logging.Logger`): Logger used during running.
+ Defaults to None.
+ training (bool): Model in training mode or eval mode.
+ modules_to_record (mmcv.Config, list): Index or name of modules which
+ will be recorded for output. It is necessary to specify output for
+ static graph of model training or inference.
+ args (argument list): Arguments passed to the `__init__`
+ method of PoplarExecutor.
+ kwargs (keyword arguments): Keyword arguments passed to the `__init__`
+ method of PoplarExecutor.
+ """
+
+ def __init__(self,
+ model,
+ logger=None,
+ training=True,
+ modules_to_record=None,
+ *args,
+ **kwargs):
+        # self.model == self._user_model: the input pytorch model
+        # self._model: the wrapped model used to compile and update weights;
+        # these two models share the same weights.
+        # The wrapped model only accepts and outputs tuples, so
+        # HierarchicalDataManager converts dictionaries
+        # to tuples and back
+ self.inputs_manager = HierarchicalDataManager(logger=logger)
+ self.outputs_manager = HierarchicalDataManager(logger=logger)
+ self.logger = logger
+ # the features calculated by CPU
+ self.inter_outputs_in_cpu = {}
+ # the features calculated by IPU
+ self.inter_outputs_in_ipu = {}
+ if modules_to_record is None:
+            # It is possible that the IPU implementation of some operators
+            # is inconsistent with the expected (CPU) results; this mechanism
+            # can be used to confirm whether there is a problem
+ self.compare_with_cpu = False
+ else:
+ self.compare_with_cpu = True
+ # move model.fp16_enabled to self.fp16_enabled,
+        # modify the position where the input is automatically cast to half
+ if getattr(model, 'fp16_enabled', False):
+ model.fp16_enabled = False
+ self.fp16_enabled = True
+ # make torch.jit.trace convert self._model
+ model = WrappedNet(
+ model,
+ self.inputs_manager,
+ self.outputs_manager,
+ self.inter_outputs_in_cpu,
+ modules_to_record=modules_to_record)
+ super().__init__(model, training=training, *args, **kwargs)
+ # overwrite self._args_parser in train_step or val_step
+ self._args_parser = None
+ if training:
+ assert self.training
+ else:
+ assert not self.training
+
+ @property
+ def training(self):
+        # Accessing the `training` attribute of self would, since this class
+        # has no `training` attribute of its own, automatically fall back to
+        # the `training` attribute of self.model.
+        # However, the real attribute we want to check is self._training;
+        # self.model.training and self._training are often inconsistent.
+        # It is not clear whether this is a Poptorch bug or a deliberate
+        # design, so temporarily use this property to work around the problem
+ return self._training # comes from self.model._training
+
+ @auto_fp16(supported_types=(PoplarExecutor, ))
+ def run_model(self, data_dict):
+ # this function is used to parse input_dict
+ # and convert to output_dict
+ if self.isCompiled():
+ self.inputs_manager.record_hierarchical_data(data_dict)
+ inputs_tuple = tuple(self.inputs_manager.collect_all_tensors())
+ else:
+ # get tensors out of data and put them in a tuple
+ self.inputs_manager.record_hierarchical_data(data_dict)
+ inputs_tuple = tuple(self.inputs_manager.collect_all_tensors())
+            # skip the shape/value checks in the data managers after compilation
+ self.inputs_manager.quick()
+ self.outputs_manager.quick()
+
+ # parser args in the first iter
+ if self._args_parser is None:
+ self._args_parser = DictArgsParser({'args': inputs_tuple})
+
+ # run or convert model
+ # the plain_outputs will be used in converting stage
+ plain_outputs = self(inputs_tuple)
+
+ self.inputs_manager.clean_all_tensors()
+
+ # put list of tensors back to the output dict
+ # according to the same order
+ self.outputs_manager.update_all_tensors(plain_outputs)
+ # get the real output dictionary from self.outputs_manager
+ output_dict = self.outputs_manager.hierarchical_data
+
+ # split output_dict into inter_outputs_in_ipu
+ # and output of the torch model
+ torch_model_output = {}
+ for name in output_dict:
+ if name in self.inter_outputs_in_cpu:
+ self.inter_outputs_in_ipu[name] = output_dict[name]
+ else:
+ torch_model_output[name] = output_dict[name]
+
+ if 'output of WrappedNet: single tensor' in output_dict:
+ assert len(torch_model_output) == 1
+ assert isinstance(
+ torch_model_output['output of WrappedNet: single tensor'],
+ torch.Tensor)
+ torch_model_output = \
+ torch_model_output['output of WrappedNet: single tensor']
+
+ return torch_model_output
+
+ def train_step(self, data, optimizer=None, **kwargs):
+ # arguments from mmcls/models/classifiers/base.py:
+ # BaseClassifier.train_step
+ assert self.training
+ assert len(kwargs) == 0 # TODO, support later if necessary
+
+        # TODO support DataContainer as input;
+        # currently, auto_fp16 and HierarchicalDataManager take too much
+        # time traversing DataContainer
+ data['img_metas'] = None
+ num_samples = len(data['img'].data)
+
+ # TODO we will ignore optimizer because it will not be used in model,
+ # support later if necessary
+ data['optimizer'] = None
+ output_dict = self.run_model(data)
+
+        # outputs contain loss, log_vars and num_samples,
+        # but only loss (torch.Tensor) has been updated on the IPU;
+        # remove all unchanged vars and keep only the torch.Tensor
+ neat_output_dict = {'loss': output_dict['loss']}
+
+ # re-parse outputs, get back log_vars and num_samples
+ loss, log_vars = self.model._parse_losses(neat_output_dict)
+ final_output_dict = dict(
+ loss=loss, log_vars=log_vars, num_samples=num_samples)
+ return final_output_dict
+
+ def eval_call(self, img, img_metas=None, return_loss=True, **kwargs):
+ # arguments from mmdet/models/detectors/base.py:BaseDetector.forward
+        # temporary usage for eval mode
+ assert not self.training
+ assert len(kwargs) == 0 # TODO, support later if necessary
+ assert not return_loss
+ data = {'img': img, 'img_metas': img_metas, 'return_loss': return_loss}
+
+ output_dict = self.run_model(data)
+
+ return output_dict
+
+ def detachFromDevice(self):
+ if self.isCompiled() and self._is_attached:
+ super().detachFromDevice()
+
+ def attachToDevice(self):
+ if self.isCompiled() and not self._is_attached:
+ super().attachToDevice()
+
+
+class TrainEvalModel:
+ """A class maintaining training MMPoplarExecutor and inference
+ MMPoplarExecutor.
+
+ Args:
+ train_model (:obj:`nn.Module`): The training model to be compiled.
+ ``train_model`` can be None if only executing validation.
+ eval_model (:obj:`nn.Module`): The inference model to be compiled.
+ options (mmcv.Config, dict): Options that will be used to compile
+ and run the model.
+ optimizer (:obj:`torch.optim.Optimizer`, optional): torch
+ optimizer, necessary if in training mode
+ logger (:obj:`logging.Logger`): Logger used during running.
+ Defaults to None.
+ modules_to_record (mmcv.Config, list): Index or name of modules which
+ will be recorded for output. It is necessary to specify output for
+ static graph of model training or inference.
+ """
+
+ def __init__(self,
+ train_model,
+ eval_model,
+ options,
+ optimizer,
+ modules_to_record=None,
+ logger=None):
+ if train_model is None:
+ self._train_executor = None
+ self.training = False
+ else:
+ self._train_executor = get_training_model(
+ train_model,
+ options=options['training'],
+ optimizer=optimizer,
+ logger=logger,
+ modules_to_record=modules_to_record)
+ self.training = True
+ self._eval_executor = get_inference_model(
+ eval_model, options=options['inference'], logger=logger)
+
+ @property
+ def executor(self):
+ if self.training:
+ return self._train_executor
+ else:
+ return self._eval_executor
+
+ def train(self, mode: bool = True):
+ """Sets the module in training mode.
+
+ This has any effect only on certain modules. See documentations of
+ particular modules for details of their behaviors in
+ training/evaluation mode, if they are affected,
+ e.g. :class:`Dropout`, :class:`BatchNorm`, etc.
+
+ Args:
+ mode (bool): whether to set training mode (``True``) or evaluation
+ mode (``False``). Default: ``True``.
+
+ Returns:
+ Module: self
+ """
+ if not isinstance(mode, bool):
+ raise ValueError('training mode is expected to be boolean, '
+ f'but got {type(mode)}')
+ if self._train_executor is None and mode:
+ raise RuntimeError(
+                'The train_executor is not initialized. '
+                'If you want to initialize train_executor, '
+                'you need to input optimizer when converting pytorch model')
+
+ if mode == self.training:
+ self.model.train(mode)
+ return self
+ else:
+ if self.isCompiled():
+ # copy weights from IPU to cpu before off-load current session
+ self.copyWeightsToHost()
+                # detach the current session before changing the mode;
+                # if it is training mode and weights were updated,
+                # poptorch will copy weights from IPU to host
+ self.detachFromDevice()
+
+            self.training = mode  # the session changes when the mode changes
+ self.model.train(mode)
+
+ # after changing mode, attach the current new session,
+ # and this function will copy weights of model to device
+ self.attachToDevice()
+ return self
+
+ def eval(self):
+ """Sets the module in evaluation mode.
+
+ This has any effect only on certain modules.
+ See documentations of particular modules
+ for details of their behaviors in training/evaluation mode,
+ if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc.
+
+        This is equivalent to calling :meth:`self.train(False)`.
+
+ See :ref:`locally-disable-grad-doc` for a comparison between
+ `.eval()` and several similar mechanisms that may be confused with it.
+
+ Returns:
+ Module: self
+ """
+ return self.train(False)
+
+ def compare_data_between_ipu_and_cpu(self, inter_outputs_in_cpu,
+ inter_outputs_in_ipu):
+ for key, val in inter_outputs_in_cpu.items():
+ is_tensor = isinstance(val['fea_in'], torch.Tensor)
+ fea_in_cpu = val['fea_in']
+ fea_in_cpu_list = [fea_in_cpu] if is_tensor else fea_in_cpu
+ fea_in_ipu = inter_outputs_in_ipu[key]['fea_in']
+ fea_in_ipu_list = [fea_in_ipu] if is_tensor else fea_in_ipu
+
+ is_tensor = isinstance(val['fea_out'], torch.Tensor)
+ fea_out_cpu = val['fea_out']
+ fea_out_cpu_list = [fea_out_cpu] if is_tensor else fea_out_cpu
+ fea_out_ipu = inter_outputs_in_ipu[key]['fea_out']
+ fea_out_ipu_list = [fea_out_ipu] if is_tensor else fea_out_ipu
+
+ print('comparing layer:', key)
+ for idx, (featA, featB) in \
+ enumerate(zip(fea_in_cpu_list, fea_in_ipu_list)):
+ print('fea_in, tensor ', idx)
+ compare_ndarray(featA.detach().numpy(), featB.detach().numpy())
+ for idx, (featA, featB) in \
+ enumerate(zip(fea_out_cpu_list, fea_out_ipu_list)):
+ print('fea_out, tensor', idx)
+ compare_ndarray(featA.detach().numpy(), featB.detach().numpy())
+
+ # TODO Unified training and eval interface,
+ # merge train_step(train) and __call__(eval) together
+ def train_step(self, data, optimizer=None, **kwargs):
+ assert self.training, 'not supported train_step on eval mode'
+ inter_outputs_in_cpu = {}
+ if (self._train_executor.isCompiled()
+ and self._train_executor.compare_with_cpu):
+ self.copyWeightsToHost()
+ # run in CPU mode
+ self._train_executor.model.train_step(data, optimizer, **kwargs)
+ inter_outputs_in_cpu = {
+ **(self._train_executor.inter_outputs_in_cpu)
+ }
+ # run in IPU mode
+ result = self._train_executor.train_step(data, optimizer, **kwargs)
+ if (self._train_executor.isCompiled()
+ and self._train_executor.compare_with_cpu
+ and len(inter_outputs_in_cpu) > 0):
+ self.compare_data_between_ipu_and_cpu(
+ inter_outputs_in_cpu,
+ self._train_executor.inter_outputs_in_ipu)
+ return result
+
+ # TODO Unified training and eval interface,
+ # merge train_step(train) and __call__(eval) together
+ def __call__(self, *args, **kwargs):
+ if self.training:
+ raise NotImplementedError('use train_step rather than __call__')
+ else:
+ return self._eval_executor.eval_call(*args, **kwargs)
+
+ def __getattr__(self, attr):
+ return getattr(self.executor, attr)
+
+
+def get_training_model(model: nn.Module,
+ options: Optional[poptorch.Options] = None,
+ optimizer: Optional[torch.optim.Optimizer] = None,
+ logger=None,
+ modules_to_record=None) -> poptorch.PoplarExecutor:
+ """Create a PopTorch training model from a PyTorch model, running on IPU
+ hardware in training mode.
+
+ Note:
+ PopTorch makes a shallow copy of the model. Changes to the
+ parameters in the returned training model affect the original model
+ and vice versa. However, primitive variable types are not synced: for
+ example calling ``model.train()`` on the original model, which
+ changes the ``training`` bool of the model instance, will not alter the
+ model returned by this function. You may need to call ``model.train()``
+ on your model before you call this function for correct behavior.
+
+ Args:
+ model (:obj:`nn.Module`): The model to run.
+ options (poptorch.Options): Options that will be used to compile
+ and run the model.
+ optimizer (:obj:`torch.optim.Optimizer`, optional): The optimizers
+ to apply during training.
+ logger (:obj:`logging.Logger`): Logger used during running.
+ Defaults to None.
+ modules_to_record (mmcv.Config, list): Index or name of modules which
+ will be recorded for output. It is necessary to specify output for
+ static graph of model training or inference.
+
+ Returns:
+ The :class:`poptorch.PoplarExecutor` wrapper to use in place
+ of ``model``.
+ """
+ # Create a copy of the original model in case it needs to be wrapped
+ maybe_wrapped_model = copy.copy(model)
+
+ return MMPoplarExecutor(
+ model=maybe_wrapped_model,
+ logger=logger,
+ options=options,
+ training=True,
+ optimizer=optimizer,
+ user_model=model,
+ modules_to_record=modules_to_record,
+ poptorch_version=__version__)
+
+
+def get_inference_model(model: Union[nn.Module, poptorch.PoplarExecutor],
+ options: Optional[poptorch.Options] = None,
+ logger=None) -> poptorch.PoplarExecutor:
+ """Create a PopTorch inference model from a PyTorch model, running on IPU
+ hardware in inference mode.
+
+ Note:
+ PopTorch makes a shallow copy of the model. Changes to the
+ parameters in the returned inference model affect the original model
+ and vice versa. However, primitive variable types are not synced: for
+ example calling ``model.eval()`` on the original model will not alter
+ the model returned by this function. You may need to call
+ ``model.eval()`` on your model before you call this function for
+ correct behavior.
+
+ Args:
+ model (:obj:`nn.Module`): The model to run.
+ options (poptorch.Options): Options that will be used to compile
+ and run the model.
+ logger (:obj:`logging.Logger`): Logger used during running.
+ Defaults to None.
+
+ Returns:
+ The :class:`poptorch.PoplarExecutor` wrapper to use in place of
+ ``model``.
+ """
+
+ return MMPoplarExecutor(
+ model=copy.copy(model),
+ logger=logger,
+ options=options,
+ training=False,
+ poptorch_version=__version__)
+
+
+def ipu_model_wrapper(model,
+ options,
+ optimizer=None,
+ logger=None,
+ modules_to_record=None,
+ ipu_model_cfg=None,
+ fp16_cfg=None):
+ """Convert torch model to IPU model.
+
+ Args:
+ model (nn.Module): The target model to be converted.
+ options (dict[str, poptorch.Options]): IPU options, generated
+ by :func:`cfg2options`.
+ optimizer (:obj:`torch.optim.Optimizer`, optional): torch
+ optimizer, necessary if in training mode
+ logger (:obj:`logging.Logger`): Logger used during training.
+ modules_to_record (mmcv.Config, list): Index or name of modules which
+ will be recorded for output. It is necessary to specify output for
+ static graph of model training or inference.
+ ipu_model_cfg (dict): A dictionary contains train_split_edges and
+ train_ckpt_nodes, See details in :func:`model_sharding` and
+ :func:`recomputation_checkpoint` functions.
+ fp16_cfg (dict): Config for IPU fp16 training. Currently supports
+ configs: `loss_scale`, `velocity_accum_type` and `accum_type`.
+ See details in
+ https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/index.html
+
+ Returns:
+ TrainEvalModel: IPU wrapped model.
+ """
+ if ipu_model_cfg is None:
+ ipu_model_cfg = {}
+ training = model.training if optimizer is not None else False
+ # set mixed-precision
+ if fp16_cfg is not None:
+ from mmcv.runner import wrap_fp16_model
+ loss_scale = fp16_cfg['loss_scale']
+ wrap_fp16_model(model)
+ model.half()
+        # TODO temporary usage to set loss scaling for the original torch optimizer
+ if optimizer is not None:
+ optimizer.loss_scaling = loss_scale
+ if fp16_cfg.get('velocity_accum_type', False):
+ if fp16_cfg['velocity_accum_type'] == 'half':
+ optimizer.velocity_accum_type = torch.half
+ else:
+ optimizer.velocity_accum_type = torch.float32
+ if fp16_cfg.get('accum_type', False):
+ if fp16_cfg['accum_type'] == 'half':
+ optimizer.accum_type = torch.half
+ else:
+ optimizer.accum_type = torch.float32
+ # TODO support feature alignment for fp16
+ if modules_to_record is not None:
+ raise NotImplementedError(
+ 'Feature alignment for fp16 is not implemented')
+
+ # set model partition
+ if optimizer is None:
+ train_model = None
+ else:
+ # split model into multi-IPUs if specified
+ train_model = model_sharding(
+ copy.copy(model).train(),
+ ipu_model_cfg.get('train_split_edges', []))
+
+ recomputation_checkpoint(train_model,
+ ipu_model_cfg.get('train_ckpt_nodes', []))
+
+ # TODO support feature alignment for gradient accumulation mode
+ gradient_accumulation = \
+ getattr(options['training'].Training, 'gradient_accumulation', 1)
+ if gradient_accumulation > 1:
+ assert modules_to_record is None, \
+ 'Feature alignment for grad-accumulation mode not implemented'
+
+ # TODO support feature alignment for multi-replica mode
+ replication_factor = \
+ getattr(options['training'], 'replication_factor', 1)
+ if replication_factor > 1:
+ assert modules_to_record is None, \
+ 'Feature alignment for multi-replica mode not implemented'
+
+ # TODO supports different model partitions between train and eval mode
+ assert len(ipu_model_cfg.get('eval_split_edges', [])) == 0,\
+ 'Currently, BeginBlock can only be used once on the same model'
+ eval_model = copy.copy(model).eval()
+
+ # wrap model for compilation
+ model = TrainEvalModel(
+ train_model,
+ eval_model,
+ options=options,
+ optimizer=optimizer,
+ logger=logger,
+ modules_to_record=modules_to_record)
+ model.train(training)
+ return model
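+
+
+# Usage sketch (illustrative; `cfg`, `net`, `optim` and `data_batch` are
+# assumed names, not defined in this module):
+#
+#   from .utils import cfg2options
+#   options = cfg2options(cfg.options_cfg)          # dict of poptorch.Options
+#   ipu_model = ipu_model_wrapper(net, options, optimizer=optim)
+#   outputs = ipu_model.train_step(data_batch)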
diff --git a/mmcv/mmcv/device/ipu/runner.py b/mmcv/mmcv/device/ipu/runner.py
new file mode 100755
index 0000000000000000000000000000000000000000..e2d4922677e08b2d6b5132a01034de8b043fa3f1
--- /dev/null
+++ b/mmcv/mmcv/device/ipu/runner.py
@@ -0,0 +1,142 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmcv.runner import (HOOKS, RUNNERS, BaseRunner, EpochBasedRunner,
+ IterBasedRunner)
+from mmcv.utils import IS_IPU_AVAILABLE
+
+if IS_IPU_AVAILABLE:
+ from .dataloader import IPUDataLoader
+ from .hook_wrapper import (IPUFp16OptimizerHook, wrap_lr_updater_hook,
+ wrap_optimizer_hook)
+ from .model_wrapper import ipu_model_wrapper
+ from .utils import build_from_cfg_with_wrapper, cfg2options
+
+
+class IPUBaseRunner(BaseRunner):
+ """A base runner for IPU.
+
+ This runner has some extra processes for IPU which are shown below:
+
+ 1. Parse options for IPU
+    2. Wrap the pytorch model for IPU
+    3. Raise errors when illegal usage is encountered
+    4. Input IPU options and initialize the dataloader if an instance of
+       IPUDataLoader is found
+
+ Args:
+ model (:obj:`nn.Module`): The model to run.
+ options_cfg (mmcv.Config, dict): Options that will be used to compile
+ and run the model.
+ modules_to_record (mmcv.Config, list): Index or name of modules which
+ will be recorded for output. It is necessary to specify output for
+ static graph of model training or inference.
+ ipu_model_cfg (mmcv.Config, dict): Config of model partition and
+ recomputing checkpoint
+ fp16_cfg (mmcv.Config): Config for fp16 training.
+ batch_processor (callable): A callable method that process a data
+ batch. Should be None for IPU runner
+ kwargs (Dict[str, Any], optional): Keyword arguments will be passed to
+ ``base_runner.BaseRunner``.
+ """
+
+ def __init__(self,
+ model,
+ options_cfg=None,
+ modules_to_record=None,
+ ipu_model_cfg=None,
+ fp16_cfg=None,
+ batch_processor=None,
+ **kwargs):
+ assert hasattr(model, 'train_step') and batch_processor is None,\
+ 'only support model with train_step'
+
+ if options_cfg is None:
+ options_cfg = {}
+ # call BaseRunner.__init__() here
+ super().__init__(model, **kwargs)
+
+ # process options of ipu
+ if IS_IPU_AVAILABLE:
+ self.options = cfg2options(options_cfg)
+ self.model = ipu_model_wrapper(
+ self.model,
+ self.options,
+ self.optimizer,
+ self.logger,
+ modules_to_record=modules_to_record,
+ ipu_model_cfg=ipu_model_cfg,
+ fp16_cfg=fp16_cfg)
+ else:
+ raise NotImplementedError('cpu mode on IPURunner is not supported')
+
+ def register_lr_hook(self, lr_config):
+ if lr_config is None:
+ return
+ assert isinstance(lr_config, dict)
+ assert 'policy' in lr_config
+ policy_type = lr_config.pop('policy')
+ # If the type of policy is all in lower case,
+ # e.g., 'cyclic', then its first letter will be capitalized,
+ # e.g., to be 'Cyclic'.
+ # This is for the convenient usage of Lr updater.
+        # Since this is not applicable for `CosineAnnealingLrUpdater`,
+        # the string will not be changed
+ # if it contains capital letters.
+ if policy_type == policy_type.lower():
+ policy_type = policy_type.title()
+ hook_type = policy_type + 'LrUpdaterHook'
+ lr_config['type'] = hook_type
+ hook = build_from_cfg_with_wrapper(lr_config, HOOKS,
+ wrap_lr_updater_hook)
+ self.register_hook(hook, priority='VERY_HIGH')
+
+ def register_optimizer_hook(self, optimizer_config):
+ if optimizer_config is None:
+ return
+ assert isinstance(optimizer_config, (dict, IPUFp16OptimizerHook))
+ if isinstance(optimizer_config, dict):
+ optimizer_config.setdefault('type', 'OptimizerHook')
+ hook = build_from_cfg_with_wrapper(optimizer_config, HOOKS,
+ wrap_optimizer_hook)
+ else:
+ hook = optimizer_config
+ self.register_hook(hook, priority='ABOVE_NORMAL')
+
+ def run(self, data_loaders, workflow, *args, **kwargs):
+ for i, flow in enumerate(workflow):
+ mode, _ = flow
+ # initialize IPU dataloader if not initialized
+ assert isinstance(data_loaders[i], IPUDataLoader),\
+ 'IPU runner can only work with `IPUDataLoader`'
+ data_loaders[i].init(options=self.get_options(mode))
+
+ super().run(data_loaders, workflow, *args, **kwargs)
+
+ def get_options(self, mode):
+ if mode == 'train':
+ return self.options['training']
+ elif mode == 'val':
+ return self.options['inference']
+ else:
+ raise ValueError(f'mode should be train or val but got {mode}')
+
+
+@RUNNERS.register_module()
+class IPUEpochBasedRunner(IPUBaseRunner, EpochBasedRunner):
+ """Epoch-based Runner for IPU.
+
+    The inheritance order (MRO) is: IPUEpochBasedRunner -> IPUBaseRunner ->
+    EpochBasedRunner -> BaseRunner. This runner trains models epoch by epoch.
+ """
+ pass
+
+
+@RUNNERS.register_module()
+class IPUIterBasedRunner(IPUBaseRunner, IterBasedRunner):
+ """Iteration-based Runner for IPU.
+
+    The inheritance order (MRO) is: IPUIterBasedRunner -> IPUBaseRunner ->
+    IterBasedRunner -> BaseRunner. This runner trains models iteration by
+    iteration.
+ """
+ pass
diff --git a/mmcv/mmcv/device/ipu/utils.py b/mmcv/mmcv/device/ipu/utils.py
new file mode 100755
index 0000000000000000000000000000000000000000..79709db1ee1282e8daa6614ceb23481d3cd58338
--- /dev/null
+++ b/mmcv/mmcv/device/ipu/utils.py
@@ -0,0 +1,244 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import inspect
+
+import numpy as np
+import popart
+import poptorch
+import torch
+import torch.nn as nn
+
+from mmcv.utils import Registry
+
+
+def _options_assigner(cfg, options_node):
+ # set popart.options by config
+ # cfg: dict, python data type
+ # options_node: python module or function
+ if isinstance(cfg, dict):
+ for key in cfg:
+ _options_assigner(cfg[key], getattr(options_node, key))
+ elif isinstance(cfg, (int, float, str, list)):
+ if callable(options_node):
+ options_node(cfg)
+ else:
+ error_msg = f'options_node type {type(options_node)} not supported'
+ raise NotImplementedError(error_msg)
+ else:
+ error_msg = f'cfg type {type(cfg)} not supported'
+ raise NotImplementedError(error_msg)
+
+
+def cfg2options(cfg):
+ """Parse dictionary to ipu options.
+
+ Args:
+ cfg (dict): A dictionary of ipu settings.
+
+ Returns:
+ dict[str, poptorch.Options]: Training options and inference options
+ of IPU.
+ """
+ # set ipu options for inference and training by config
+ train_cfg = cfg.pop('train_cfg', {})
+ eval_cfg = cfg.pop('eval_cfg', {})
+    eval_cfg['replicationFactor'] = 1  # eval mode only uses one replica
+ eval_cfg['executionStrategy'] = 'ShardedExecution'
+ # overwrite default ipu cfg with specified train cfgs
+ training_ipu_cfg = {**cfg, **train_cfg}
+ # overwrite default ipu cfg with specified eval cfgs
+ inference_ipu_cfg = {**cfg, **eval_cfg}
+
+ ipu_options = {
+ 'training': _cast_to_options(training_ipu_cfg),
+ 'inference': _cast_to_options(inference_ipu_cfg)
+ }
+
+    # TODO: make the options below configurable
+ ipu_options['training']._Popart.set('disableGradAccumulationTensorStreams',
+ True)
+ ipu_options['training']._Popart.set(
+ 'accumulateOuterFragmentSettings.schedule',
+ int(popart.AccumulateOuterFragmentSchedule.OverlapMemoryOptimized))
+ ipu_options['training'].Precision.enableStochasticRounding(True)
+
+ return ipu_options
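+# A minimal usage sketch of ``cfg2options`` (the option keys below are
+# assumptions and must correspond to attributes/methods of poptorch.Options):
+#   options_cfg = dict(
+#       randomSeed=42,
+#       deviceIterations=1,
+#       train_cfg=dict(replicationFactor=2),
+#       eval_cfg=dict(deviceIterations=4))
+#   ipu_options = cfg2options(options_cfg)
+#   # ipu_options['training'] / ipu_options['inference'] are poptorch.Options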
+
+
+def _cast_to_options(cfg):
+    # Options that cannot be assigned directly are parsed with the explicit
+    # if-statements below; the remaining ones are assigned generically via
+    # _options_assigner.
+ options = poptorch.Options()
+
+ if 'availableMemoryProportion' in cfg:
+ available_memory_proportion = cfg.pop('availableMemoryProportion')
+ mem_props = {}
+ for i, mem_prop in enumerate(available_memory_proportion):
+ mem_props[f'IPU{i}'] = mem_prop
+ options.setAvailableMemoryProportion(mem_props)
+
+ if 'executionStrategy' in cfg:
+ execution_strategy = cfg.pop('executionStrategy')
+ if execution_strategy == 'SameAsIpu':
+ options.setExecutionStrategy(
+ poptorch.PipelinedExecution(
+ getattr(poptorch.AutoStage, execution_strategy)))
+ elif execution_strategy == 'ShardedExecution':
+ options.setExecutionStrategy(poptorch.ShardedExecution())
+ else:
+ raise NotImplementedError(
+ 'executionStrategy should be "SameAsIpu" or "ShardedExecution"'
+ f', but got {execution_strategy}')
+
+ if 'partialsType' in cfg:
+ partials_type = cfg.pop('partialsType')
+ options.Precision.setPartialsType(getattr(
+ torch, partials_type)) # half or float
+
+ _options_assigner(cfg, options)
+ return options
+
+
+def model_sharding(model, split_edges):
+    """Split a model in-place across multiple IPUs.
+
+ Args:
+ model (nn.Module): The target model to be split.
+        split_edges (list of dict): Layer names or layer indices at which the
+            model is split. Each item of ``split_edges`` is a dictionary,
+            which may contain the following key-value pairs:
+
+            - layer_to_call: Name or index of the layer that begins the block.
+ - user_id (optional): A user defined identifier for the block.
+ - ipu_id: The id of the IPU to run on.
+
+ Examples:
+ >>> split_edges = [
+ ... dict(layer_to_call='model.conv1', ipu_id=0),
+ ... dict(layer_to_call='model.conv3', ipu_id=1)]
+ >>> sharding_model = model_sharding(torch_model, split_edges)
+
+ Returns:
+ nn.Module: Split model.
+ """
+ if len(split_edges) == 0:
+ return model
+ assert isinstance(split_edges, list)
+    split_edges_dict = {edge['layer_to_call']: edge for edge in split_edges}
+
+    for idx, (name, module) in enumerate(model.named_modules()):
+        if idx in split_edges_dict and name in split_edges_dict:
+            raise ValueError(
+                'The same layer is referenced twice while doing model'
+                f' partition: idx is {idx} and name is {name}')
+
+        edge = split_edges_dict.pop(name, None)
+        edge = split_edges_dict.pop(idx, edge)
+        if edge is not None:
+            poptorch.BeginBlock(module, edge.get('user_id', name),
+                                edge['ipu_id'])
+
+    # ensure all split_edges are used
+    if len(split_edges_dict) > 0:
+        split_edge_names = list(split_edges_dict.keys())
+ raise RuntimeError(
+ f'split_edges: {split_edge_names} are not contained in the model')
+ return model
+
+
+def recomputation_checkpoint(model: nn.Module, module_names: list):
+ """Annotates the output of a module to be checkpointed instead of
+ recomputed.
+
+    If recomputation mode is enabled, the IPU releases the activations of
+    intermediate layers to save memory and recalculates them during the
+    backward pass. This function declares which intermediate activations
+    should be kept so that the recomputation of those layers can be skipped.
+
+ Args:
+ model (nn.Module): The target model to apply recomputation
+ checkpoint.
+ module_names (list): Layer names of module.
+ """
+
+ def recompute_outputs(module, inputs, outputs):
+ if isinstance(outputs, tuple):
+ return tuple(poptorch.recomputationCheckpoint(y) for y in outputs)
+ else:
+ return poptorch.recomputationCheckpoint(outputs)
+
+ for name, module in model.named_modules():
+ if name in module_names:
+ module.register_forward_hook(recompute_outputs)
+ module_names.remove(name)
+
+ # check all module_names are used
+ assert len(module_names) == 0,\
+ f'recomputed nodes: {module_names} are not contained in the model'
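+# Usage sketch (the module names are hypothetical and must match entries of
+# ``model.named_modules()``):
+#   recomputation_checkpoint(model, ['backbone.layer2', 'backbone.layer3'])
+# The registered forward hooks wrap the listed modules' outputs with
+# ``poptorch.recomputationCheckpoint`` so they are kept instead of recomputed.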
+
+
+def compare_ndarray(featA, featB, rtol=1e-3, atol=1e-5):
+    """Compare two activations or weights and print the mismatch, if any."""
+ try:
+ np.testing.assert_allclose(featA, featB, rtol=rtol, atol=atol)
+ except AssertionError as e:
+ print(e)
+
+
+def build_from_cfg_with_wrapper(cfg,
+ registry,
+ wrapper_func=None,
+ default_args=None):
+ """Build a module from config dict and wrap module with "wrapper_func".
+
+ Args:
+ cfg (dict): Config dict. It should at least contain the key "type".
+ registry (:obj:`Registry`): The registry to search the type from.
+ default_args (dict, optional): Default initialization arguments.
+        wrapper_func (function, optional): Function used to wrap the class
+            before instantiation. Default: None.
+
+ Returns:
+ object: The constructed object.
+ """
+ if not isinstance(cfg, dict):
+ raise TypeError(f'cfg must be a dict, but got {type(cfg)}')
+ if 'type' not in cfg:
+ if default_args is None or 'type' not in default_args:
+ raise KeyError(
+ '`cfg` or `default_args` must contain the key "type", '
+ f'but got {cfg}\n{default_args}')
+ if not isinstance(registry, Registry):
+ raise TypeError('registry must be an mmcv.Registry object, '
+ f'but got {type(registry)}')
+ if not (isinstance(default_args, dict) or default_args is None):
+ raise TypeError('default_args must be a dict or None, '
+ f'but got {type(default_args)}')
+
+ args = cfg.copy()
+
+ if default_args is not None:
+ for name, value in default_args.items():
+ args.setdefault(name, value)
+
+ obj_type = args.pop('type')
+ if isinstance(obj_type, str):
+ obj_cls = registry.get(obj_type)
+ if obj_cls is None:
+ raise KeyError(
+ f'{obj_type} is not in the {registry.name} registry')
+ elif inspect.isclass(obj_type):
+ obj_cls = obj_type
+ else:
+ raise TypeError(
+ f'type must be a str or valid type, but got {type(obj_type)}')
+
+ if wrapper_func is None:
+ wrapped_obj_cls = obj_cls
+ else:
+ wrapped_obj_cls = wrapper_func(obj_cls)
+ try:
+ return wrapped_obj_cls(**args)
+ except Exception as e:
+ # Normal TypeError does not print class name.
+ raise type(e)(f'{wrapped_obj_cls.__name__}: {e}')
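+# Usage sketch (``MyHook`` and ``my_wrapper`` are hypothetical; ``MyHook`` is
+# assumed to be registered in the ``HOOKS`` registry):
+#   hook = build_from_cfg_with_wrapper(dict(type='MyHook', interval=10),
+#                                      HOOKS, wrapper_func=my_wrapper)
+# which is roughly equivalent to ``my_wrapper(MyHook)(interval=10)``.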
diff --git a/mmcv/mmcv/device/mlu/__init__.py b/mmcv/mmcv/device/mlu/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..77c71ccf3ce38f3cbc9911f1d9d4b05a531771f2
--- /dev/null
+++ b/mmcv/mmcv/device/mlu/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .data_parallel import MLUDataParallel
+from .distributed import MLUDistributedDataParallel
+
+__all__ = ['MLUDataParallel', 'MLUDistributedDataParallel']
diff --git a/mmcv/mmcv/device/mlu/_functions.py b/mmcv/mmcv/device/mlu/_functions.py
new file mode 100644
index 0000000000000000000000000000000000000000..75660fa9b3635fed049cb150639244a658534824
--- /dev/null
+++ b/mmcv/mmcv/device/mlu/_functions.py
@@ -0,0 +1,24 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Union
+
+import torch
+
+
+def scatter(input: Union[List, torch.Tensor], devices: List) -> List:
+    """Scatter copies the tensor(s) to the MLU directly."""
+ if isinstance(input, list):
+ outputs = [scatter(_input, devices) for _input in input]
+ return outputs
+ elif isinstance(input, torch.Tensor):
+ output = input.contiguous()
+ return output.to('mlu') if devices != [-1] else output
+ else:
+ raise Exception(f'Unknown type {type(input)}.')
+
+
+class Scatter:
+
+ @staticmethod
+ def forward(target_mlus, input):
+ outputs = scatter(input, target_mlus)
+ return tuple(outputs) if isinstance(outputs, list) else (outputs, )
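+# Behaviour sketch: ``scatter(t, [-1])`` keeps a tensor ``t`` on the host,
+# while any other device list (e.g. ``[0]``) copies it to the MLU via
+# ``t.to('mlu')``; lists are handled element-wise and returned as lists.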
diff --git a/mmcv/mmcv/device/mlu/data_parallel.py b/mmcv/mmcv/device/mlu/data_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebe14c0a55c92f96ec7f782a591ac10b007942dc
--- /dev/null
+++ b/mmcv/mmcv/device/mlu/data_parallel.py
@@ -0,0 +1,41 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import torch
+
+from mmcv.parallel import MMDataParallel
+from .scatter_gather import scatter_kwargs
+
+
+class MLUDataParallel(MMDataParallel):
+ """The MLUDataParallel module that supports DataContainer.
+
+    MLUDataParallel is a class inherited from MMDataParallel, which supports
+ MLU training and inference only.
+
+ The main differences with MMDataParallel:
+
+    - It only supports a single MLU card, and only uses the first card to
+      run training and inference.
+
+ - It uses direct host-to-device copy instead of stream-background
+ scatter.
+
+ .. warning::
+ MLUDataParallel only supports single MLU training, if you need to
+ train with multiple MLUs, please use MLUDistributedDataParallel
+ instead. If you have multiple MLUs, you can set the environment
+ variable ``MLU_VISIBLE_DEVICES=0`` (or any other card number(s))
+ to specify the running device.
+
+ Args:
+ module (:class:`nn.Module`): Module to be encapsulated.
+ dim (int): Dimension used to scatter the data. Defaults to 0.
+ """
+
+ def __init__(self, *args, dim=0, **kwargs):
+ super().__init__(*args, dim=dim, **kwargs)
+ self.device_ids = [0]
+ self.src_device_obj = torch.device('mlu:0')
+
+ def scatter(self, inputs, kwargs, device_ids):
+ return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
diff --git a/mmcv/mmcv/device/mlu/distributed.py b/mmcv/mmcv/device/mlu/distributed.py
new file mode 100644
index 0000000000000000000000000000000000000000..3768c754c908b219fd5a770d69e6ed5416781ba8
--- /dev/null
+++ b/mmcv/mmcv/device/mlu/distributed.py
@@ -0,0 +1,20 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmcv.parallel import MMDistributedDataParallel
+from .scatter_gather import scatter_kwargs
+
+
+class MLUDistributedDataParallel(MMDistributedDataParallel):
+    """The DDP module that supports DataContainer.
+
+    MLUDDP has one difference from MMDDP: it moves data to the MLU by copying
+    instead of scattering.
+ """
+
+ def to_kwargs(self, inputs, kwargs, device_id):
+ # Use `self.to_kwargs` instead of `self.scatter` in pytorch1.8
+ # to move all tensors to device_id
+ return scatter_kwargs(inputs, kwargs, [device_id], dim=self.dim)
+
+ def scatter(self, inputs, kwargs, device_ids):
+ return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
diff --git a/mmcv/mmcv/device/mlu/scatter_gather.py b/mmcv/mmcv/device/mlu/scatter_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..0b0c9b96f51252e4c510f66a2ec5fb7522716e29
--- /dev/null
+++ b/mmcv/mmcv/device/mlu/scatter_gather.py
@@ -0,0 +1,59 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmcv.parallel.data_container import DataContainer
+from ._functions import Scatter
+
+
+def scatter(inputs, target_mlus, dim=0):
+ """Scatter inputs to target mlu.
+
+ The only difference from original :func:`scatter` is to add support for
+ :type:`~mmcv.parallel.DataContainer`.
+ """
+
+ def scatter_map(obj):
+ if isinstance(obj, torch.Tensor):
+ if target_mlus != [-1]:
+ obj = obj.to('mlu')
+ return [obj]
+ else:
+ # for CPU inference we use self-implemented scatter
+ return Scatter.forward(target_mlus, obj)
+ if isinstance(obj, DataContainer):
+ if obj.cpu_only:
+ return obj.data
+ else:
+ return Scatter.forward(target_mlus, obj.data)
+ if isinstance(obj, tuple) and len(obj) > 0:
+ return list(zip(*map(scatter_map, obj)))
+ if isinstance(obj, list) and len(obj) > 0:
+ out = list(map(list, zip(*map(scatter_map, obj))))
+ return out
+ if isinstance(obj, dict) and len(obj) > 0:
+ out = list(map(type(obj), zip(*map(scatter_map, obj.items()))))
+ return out
+        return [obj for _ in target_mlus]
+
+ # After scatter_map is called, a scatter_map cell will exist. This cell
+ # has a reference to the actual function scatter_map, which has references
+ # to a closure that has a reference to the scatter_map cell (because the
+ # fn is recursive). To avoid this reference cycle, we set the function to
+ # None, clearing the cell
+ try:
+ return scatter_map(inputs)
+ finally:
+ scatter_map = None
+
+
+def scatter_kwargs(inputs, kwargs, target_mlus, dim=0):
+ """Scatter with support for kwargs dictionary."""
+ inputs = scatter(inputs, target_mlus, dim) if inputs else []
+ kwargs = scatter(kwargs, target_mlus, dim) if kwargs else []
+ if len(inputs) < len(kwargs):
+ inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
+ elif len(kwargs) < len(inputs):
+ kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])
+ inputs = tuple(inputs)
+ kwargs = tuple(kwargs)
+ return inputs, kwargs
diff --git a/mmcv/mmcv/device/mps/__init__.py b/mmcv/mmcv/device/mps/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e28144ef0ae8cf65527cefc469d07c7ff854c688
--- /dev/null
+++ b/mmcv/mmcv/device/mps/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .data_parallel import MPSDataParallel
+
+__all__ = ['MPSDataParallel']
diff --git a/mmcv/mmcv/device/mps/data_parallel.py b/mmcv/mmcv/device/mps/data_parallel.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ae5396d24193376432ae98b792ec89fac678738
--- /dev/null
+++ b/mmcv/mmcv/device/mps/data_parallel.py
@@ -0,0 +1,34 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+import torch
+
+from mmcv.parallel import MMDataParallel
+from ..scatter_gather import scatter_kwargs
+
+
+class MPSDataParallel(MMDataParallel):
+ """The MPSDataParallel module that supports DataContainer.
+
+    MPSDataParallel is a class inherited from MMDataParallel, which supports
+ MPS training and inference only.
+
+ The main differences with MMDataParallel:
+
+    - It only supports a single MPS device, and only uses the first device to
+      run training and inference.
+
+ - It uses direct host-to-device copy instead of stream-background
+ scatter.
+
+ Args:
+ module (:class:`nn.Module`): Module to be encapsulated.
+ dim (int): Dimension used to scatter the data. Defaults to 0.
+ """
+
+ def __init__(self, *args, dim=0, **kwargs):
+ super().__init__(*args, dim=dim, **kwargs)
+ self.device_ids = [0]
+ self.src_device_obj = torch.device('mps:0')
+
+ def scatter(self, inputs, kwargs, device_ids):
+ return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
diff --git a/mmcv/mmcv/device/scatter_gather.py b/mmcv/mmcv/device/scatter_gather.py
new file mode 100644
index 0000000000000000000000000000000000000000..744b0ca51e9de4cb7c43d60a986621461519f781
--- /dev/null
+++ b/mmcv/mmcv/device/scatter_gather.py
@@ -0,0 +1,64 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from mmcv.parallel.data_container import DataContainer
+from mmcv.utils import deprecated_api_warning
+from ._functions import Scatter
+from .utils import get_device
+
+
+@deprecated_api_warning({'target_mlus': 'target_devices'})
+def scatter(inputs, target_devices, dim=0):
+ """Scatter inputs to target devices.
+
+ The only difference from original :func:`scatter` is to add support for
+ :type:`~mmcv.parallel.DataContainer`.
+ """
+ current_device = get_device()
+
+ def scatter_map(obj):
+ if isinstance(obj, torch.Tensor):
+ if target_devices != [-1]:
+ obj = obj.to(current_device)
+ return [obj]
+ else:
+ # for CPU inference we use self-implemented scatter
+ return Scatter.forward(target_devices, obj)
+ if isinstance(obj, DataContainer):
+ if obj.cpu_only:
+ return obj.data
+ else:
+ return Scatter.forward(target_devices, obj.data)
+ if isinstance(obj, tuple) and len(obj) > 0:
+ return list(zip(*map(scatter_map, obj)))
+ if isinstance(obj, list) and len(obj) > 0:
+ out = list(map(list, zip(*map(scatter_map, obj))))
+ return out
+ if isinstance(obj, dict) and len(obj) > 0:
+ out = list(map(type(obj), zip(*map(scatter_map, obj.items()))))
+ return out
+ return [obj for _ in target_devices]
+
+ # After scatter_map is called, a scatter_map cell will exist. This cell
+ # has a reference to the actual function scatter_map, which has references
+ # to a closure that has a reference to the scatter_map cell (because the
+ # fn is recursive). To avoid this reference cycle, we set the function to
+ # None, clearing the cell
+ try:
+ return scatter_map(inputs)
+ finally:
+ scatter_map = None
+
+
+@deprecated_api_warning({'target_mlus': 'target_devices'})
+def scatter_kwargs(inputs, kwargs, target_devices, dim=0):
+ """Scatter with support for kwargs dictionary."""
+ inputs = scatter(inputs, target_devices, dim) if inputs else []
+ kwargs = scatter(kwargs, target_devices, dim) if kwargs else []
+ if len(inputs) < len(kwargs):
+ inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
+ elif len(kwargs) < len(inputs):
+ kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])
+ inputs = tuple(inputs)
+ kwargs = tuple(kwargs)
+ return inputs, kwargs
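+# Padding sketch: if ``inputs`` scatters into more chunks than ``kwargs`` (or
+# vice versa), the shorter side is padded with empty tuples/dicts, e.g.
+#   scatter_kwargs((x,), {}, [0])  ->  (((x_on_device,),), ({},))
+# where ``x_on_device`` is ``x`` moved to the device reported by get_device().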
diff --git a/mmcv/mmcv/device/utils.py b/mmcv/mmcv/device/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2adec08dd98ad83cce3a9c28d3a6651808f7112
--- /dev/null
+++ b/mmcv/mmcv/device/utils.py
@@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MPS_AVAILABLE
+
+
+def get_device() -> str:
+ """Returns the currently existing device type.
+
+ Returns:
+ str: cuda | mlu | mps | cpu.
+ """
+ if IS_CUDA_AVAILABLE:
+ return 'cuda'
+ elif IS_MLU_AVAILABLE:
+ return 'mlu'
+ elif IS_MPS_AVAILABLE:
+ return 'mps'
+ else:
+ return 'cpu'
diff --git a/mmcv/mmcv/engine/__init__.py b/mmcv/mmcv/engine/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3193b7f664e19ce2458d81c836597fa22e4bb082
--- /dev/null
+++ b/mmcv/mmcv/engine/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .test import (collect_results_cpu, collect_results_gpu, multi_gpu_test,
+ single_gpu_test)
+
+__all__ = [
+ 'collect_results_cpu', 'collect_results_gpu', 'multi_gpu_test',
+ 'single_gpu_test'
+]
diff --git a/mmcv/mmcv/engine/test.py b/mmcv/mmcv/engine/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..83546caec47fb11952fd820b342c71b83b74fac2
--- /dev/null
+++ b/mmcv/mmcv/engine/test.py
@@ -0,0 +1,213 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os.path as osp
+import pickle
+import shutil
+import tempfile
+import time
+from typing import Optional
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch.utils.data import DataLoader
+
+import mmcv
+from mmcv.runner import get_dist_info
+
+
+def single_gpu_test(model: nn.Module, data_loader: DataLoader) -> list:
+ """Test model with a single gpu.
+
+    This method tests the model with a single gpu and displays a test
+    progress bar.
+
+ Args:
+ model (nn.Module): Model to be tested.
+        data_loader (DataLoader): PyTorch data loader.
+
+ Returns:
+ list: The prediction results.
+ """
+ model.eval()
+ results = []
+ dataset = data_loader.dataset
+ prog_bar = mmcv.ProgressBar(len(dataset))
+ for data in data_loader:
+ with torch.no_grad():
+ result = model(return_loss=False, **data)
+ results.extend(result)
+
+        # Assume result has the same length as batch_size
+ # refer to https://github.com/open-mmlab/mmcv/issues/985
+ batch_size = len(result)
+ for _ in range(batch_size):
+ prog_bar.update()
+ return results
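+# Usage sketch (hypothetical model/dataloader; the wrapped model is expected
+# to accept ``return_loss=False`` and batches produced by an MMCV-style
+# dataloader):
+#   model = MMDataParallel(model, device_ids=[0])
+#   outputs = single_gpu_test(model, data_loader)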
+
+
+def multi_gpu_test(model: nn.Module,
+ data_loader: DataLoader,
+ tmpdir: Optional[str] = None,
+ gpu_collect: bool = False) -> Optional[list]:
+ """Test model with multiple gpus.
+
+    This method tests the model with multiple gpus and collects the results
+    under two different modes: gpu and cpu modes. By setting
+    ``gpu_collect=True``, it encodes results to gpu tensors and uses gpu
+    communication for results collection. In cpu mode, it saves the results
+    on different gpus to ``tmpdir`` and collects them by the rank 0 worker.
+
+ Args:
+ model (nn.Module): Model to be tested.
+        data_loader (DataLoader): PyTorch data loader.
+ tmpdir (str): Path of directory to save the temporary results from
+ different gpus under cpu mode.
+ gpu_collect (bool): Option to use either gpu or cpu to collect results.
+
+ Returns:
+ list: The prediction results.
+ """
+ model.eval()
+ results = []
+ dataset = data_loader.dataset
+ rank, world_size = get_dist_info()
+ if rank == 0:
+ prog_bar = mmcv.ProgressBar(len(dataset))
+ time.sleep(2) # This line can prevent deadlock problem in some cases.
+ for i, data in enumerate(data_loader):
+ with torch.no_grad():
+ result = model(return_loss=False, **data)
+ results.extend(result)
+
+ if rank == 0:
+ batch_size = len(result)
+ batch_size_all = batch_size * world_size
+ if batch_size_all + prog_bar.completed > len(dataset):
+ batch_size_all = len(dataset) - prog_bar.completed
+ for _ in range(batch_size_all):
+ prog_bar.update()
+
+ # collect results from all ranks
+ if gpu_collect:
+ result_from_ranks = collect_results_gpu(results, len(dataset))
+ else:
+ result_from_ranks = collect_results_cpu(results, len(dataset), tmpdir)
+ return result_from_ranks
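+# Usage sketch under torch.distributed (hypothetical setup): wrap the model
+# with MMDistributedDataParallel, then call
+#   outputs = multi_gpu_test(model, data_loader, tmpdir='.dist_test',
+#                            gpu_collect=False)
+# Only rank 0 receives the merged list; all other ranks get ``None``.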
+
+
+def collect_results_cpu(result_part: list,
+ size: int,
+ tmpdir: Optional[str] = None) -> Optional[list]:
+ """Collect results under cpu mode.
+
+    In cpu mode, this function will save the results on different gpus to
+ ``tmpdir`` and collect them by the rank 0 worker.
+
+ Args:
+ result_part (list): Result list containing result parts
+ to be collected.
+ size (int): Size of the results, commonly equal to length of
+ the results.
+        tmpdir (str | None): Temporary directory in which to store the
+            collected results. If set to None, a random temporary directory
+            will be created.
+
+ Returns:
+ list: The collected results.
+ """
+ rank, world_size = get_dist_info()
+ # create a tmp dir if it is not specified
+ if tmpdir is None:
+ MAX_LEN = 512
+ # 32 is whitespace
+ dir_tensor = torch.full((MAX_LEN, ),
+ 32,
+ dtype=torch.uint8,
+ device='cuda')
+ if rank == 0:
+ mmcv.mkdir_or_exist('.dist_test')
+ tmpdir = tempfile.mkdtemp(dir='.dist_test')
+ tmpdir = torch.tensor(
+ bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda')
+ dir_tensor[:len(tmpdir)] = tmpdir
+ dist.broadcast(dir_tensor, 0)
+ tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip()
+ else:
+ mmcv.mkdir_or_exist(tmpdir)
+ # dump the part result to the dir
+ part_file = osp.join(tmpdir, f'part_{rank}.pkl') # type: ignore
+ mmcv.dump(result_part, part_file)
+ dist.barrier()
+ # collect all parts
+ if rank != 0:
+ return None
+ else:
+ # load results of all parts from tmp dir
+ part_list = []
+ for i in range(world_size):
+ part_file = osp.join(tmpdir, f'part_{i}.pkl') # type: ignore
+ part_result = mmcv.load(part_file)
+ # When data is severely insufficient, an empty part_result
+            # on a certain gpu could make the overall outputs empty.
+ if part_result:
+ part_list.append(part_result)
+ # sort the results
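+        # (With a non-shuffling DistributedSampler, the usual test-time setup,
+        # sample i of rank r corresponds to dataset index i * world_size + r,
+        # so interleaving the per-rank lists with zip restores dataset order.)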
+ ordered_results = []
+ for res in zip(*part_list):
+ ordered_results.extend(list(res))
+ # the dataloader may pad some samples
+ ordered_results = ordered_results[:size]
+ # remove tmp dir
+ shutil.rmtree(tmpdir) # type: ignore
+ return ordered_results
+
+
+def collect_results_gpu(result_part: list, size: int) -> Optional[list]:
+ """Collect results under gpu mode.
+
+    In gpu mode, this function will encode results to gpu tensors and use gpu
+ communication for results collection.
+
+ Args:
+ result_part (list): Result list containing result parts
+ to be collected.
+ size (int): Size of the results, commonly equal to length of
+ the results.
+
+ Returns:
+ list: The collected results.
+ """
+ rank, world_size = get_dist_info()
+ # dump result part to tensor with pickle
+ part_tensor = torch.tensor(
+ bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda')
+ # gather all result part tensor shape
+ shape_tensor = torch.tensor(part_tensor.shape, device='cuda')
+ shape_list = [shape_tensor.clone() for _ in range(world_size)]
+ dist.all_gather(shape_list, shape_tensor)
+ # padding result part tensor to max length
+ shape_max = torch.tensor(shape_list).max()
+ part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda')
+ part_send[:shape_tensor[0]] = part_tensor
+ part_recv_list = [
+ part_tensor.new_zeros(shape_max) for _ in range(world_size)
+ ]
+ # gather all result part
+ dist.all_gather(part_recv_list, part_send)
+
+ if rank == 0:
+ part_list = []
+ for recv, shape in zip(part_recv_list, shape_list):
+ part_result = pickle.loads(recv[:shape[0]].cpu().numpy().tobytes())
+ # When data is severely insufficient, an empty part_result
+            # on a certain gpu could make the overall outputs empty.
+ if part_result:
+ part_list.append(part_result)
+ # sort the results
+ ordered_results = []
+ for res in zip(*part_list):
+ ordered_results.extend(list(res))
+ # the dataloader may pad some samples
+ ordered_results = ordered_results[:size]
+ return ordered_results
+ else:
+ return None
diff --git a/mmcv/mmcv/fileio/__init__.py b/mmcv/mmcv/fileio/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2051b85f7e59bff7bdbaa131849ce8cd31f059a4
--- /dev/null
+++ b/mmcv/mmcv/fileio/__init__.py
@@ -0,0 +1,11 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .file_client import BaseStorageBackend, FileClient
+from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler
+from .io import dump, load, register_handler
+from .parse import dict_from_file, list_from_file
+
+__all__ = [
+ 'BaseStorageBackend', 'FileClient', 'load', 'dump', 'register_handler',
+ 'BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler',
+ 'list_from_file', 'dict_from_file'
+]
diff --git a/mmcv/mmcv/fileio/file_client.py b/mmcv/mmcv/fileio/file_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee7c3164e2c631c546dfe3345c45f8b8394a9995
--- /dev/null
+++ b/mmcv/mmcv/fileio/file_client.py
@@ -0,0 +1,1173 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import inspect
+import os
+import os.path as osp
+import re
+import tempfile
+import warnings
+from abc import ABCMeta, abstractmethod
+from contextlib import contextmanager
+from pathlib import Path
+from typing import Any, Generator, Iterator, Optional, Tuple, Union
+from urllib.request import urlopen
+
+import mmcv
+from mmcv.utils.misc import has_method
+from mmcv.utils.path import is_filepath
+
+
+class BaseStorageBackend(metaclass=ABCMeta):
+ """Abstract class of storage backends.
+
+    All backends need to implement two APIs: ``get()`` and ``get_text()``.
+    ``get()`` reads the file as a byte stream and ``get_text()`` reads the
+    file as text.
+ """
+
+ # a flag to indicate whether the backend can create a symlink for a file
+ _allow_symlink = False
+
+ @property
+ def name(self):
+ return self.__class__.__name__
+
+ @property
+ def allow_symlink(self):
+ return self._allow_symlink
+
+ @abstractmethod
+ def get(self, filepath):
+ pass
+
+ @abstractmethod
+ def get_text(self, filepath):
+ pass
+
+
+class CephBackend(BaseStorageBackend):
+ """Ceph storage backend (for internal use).
+
+ Args:
+ path_mapping (dict|None): path mapping dict from local path to Petrel
+ path. When ``path_mapping={'src': 'dst'}``, ``src`` in ``filepath``
+ will be replaced by ``dst``. Default: None.
+
+ .. warning::
+ :class:`mmcv.fileio.file_client.CephBackend` will be deprecated,
+ please use :class:`mmcv.fileio.file_client.PetrelBackend` instead.
+ """
+
+ def __init__(self, path_mapping=None):
+ try:
+ import ceph
+ except ImportError:
+ raise ImportError('Please install ceph to enable CephBackend.')
+
+ warnings.warn(
+ 'CephBackend will be deprecated, please use PetrelBackend instead',
+ DeprecationWarning)
+ self._client = ceph.S3Client()
+ assert isinstance(path_mapping, dict) or path_mapping is None
+ self.path_mapping = path_mapping
+
+ def get(self, filepath):
+ filepath = str(filepath)
+ if self.path_mapping is not None:
+ for k, v in self.path_mapping.items():
+ filepath = filepath.replace(k, v)
+ value = self._client.Get(filepath)
+ value_buf = memoryview(value)
+ return value_buf
+
+ def get_text(self, filepath, encoding=None):
+ raise NotImplementedError
+
+
+class PetrelBackend(BaseStorageBackend):
+ """Petrel storage backend (for internal use).
+
+ PetrelBackend supports reading and writing data to multiple clusters.
+ If the file path contains the cluster name, PetrelBackend will read data
+    from the specified cluster or write data to it. Otherwise, PetrelBackend
+    will access the default cluster.
+
+ Args:
+ path_mapping (dict, optional): Path mapping dict from local path to
+ Petrel path. When ``path_mapping={'src': 'dst'}``, ``src`` in
+ ``filepath`` will be replaced by ``dst``. Default: None.
+ enable_mc (bool, optional): Whether to enable memcached support.
+ Default: True.
+
+ Examples:
+ >>> filepath1 = 's3://path/of/file'
+ >>> filepath2 = 'cluster-name:s3://path/of/file'
+ >>> client = PetrelBackend()
+ >>> client.get(filepath1) # get data from default cluster
+ >>> client.get(filepath2) # get data from 'cluster-name' cluster
+ """
+
+ def __init__(self,
+ path_mapping: Optional[dict] = None,
+ enable_mc: bool = True):
+ try:
+ from petrel_client import client
+ except ImportError:
+ raise ImportError('Please install petrel_client to enable '
+ 'PetrelBackend.')
+
+ self._client = client.Client(enable_mc=enable_mc)
+ assert isinstance(path_mapping, dict) or path_mapping is None
+ self.path_mapping = path_mapping
+
+ def _map_path(self, filepath: Union[str, Path]) -> str:
+ """Map ``filepath`` to a string path whose prefix will be replaced by
+ :attr:`self.path_mapping`.
+
+ Args:
+ filepath (str): Path to be mapped.
+ """
+ filepath = str(filepath)
+ if self.path_mapping is not None:
+ for k, v in self.path_mapping.items():
+ filepath = filepath.replace(k, v)
+ return filepath
+
+ def _format_path(self, filepath: str) -> str:
+ """Convert a ``filepath`` to standard format of petrel oss.
+
+ If the ``filepath`` is concatenated by ``os.path.join``, in a Windows
+        environment, the ``filepath`` will be in the format of
+ 's3://bucket_name\\image.jpg'. By invoking :meth:`_format_path`, the
+ above ``filepath`` will be converted to 's3://bucket_name/image.jpg'.
+
+ Args:
+ filepath (str): Path to be formatted.
+ """
+ return re.sub(r'\\+', '/', filepath)
+
+ def get(self, filepath: Union[str, Path]) -> memoryview:
+ """Read data from a given ``filepath`` with 'rb' mode.
+
+ Args:
+ filepath (str or Path): Path to read data.
+
+ Returns:
+ memoryview: A memory view of expected bytes object to avoid
+ copying. The memoryview object can be converted to bytes by
+ ``value_buf.tobytes()``.
+ """
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ value = self._client.Get(filepath)
+ value_buf = memoryview(value)
+ return value_buf
+
+ def get_text(self,
+ filepath: Union[str, Path],
+ encoding: str = 'utf-8') -> str:
+ """Read data from a given ``filepath`` with 'r' mode.
+
+ Args:
+ filepath (str or Path): Path to read data.
+ encoding (str): The encoding format used to open the ``filepath``.
+ Default: 'utf-8'.
+
+ Returns:
+ str: Expected text reading from ``filepath``.
+ """
+ return str(self.get(filepath), encoding=encoding)
+
+ def put(self, obj: bytes, filepath: Union[str, Path]) -> None:
+ """Save data to a given ``filepath``.
+
+ Args:
+ obj (bytes): Data to be saved.
+ filepath (str or Path): Path to write data.
+ """
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ self._client.put(filepath, obj)
+
+ def put_text(self,
+ obj: str,
+ filepath: Union[str, Path],
+ encoding: str = 'utf-8') -> None:
+ """Save data to a given ``filepath``.
+
+ Args:
+ obj (str): Data to be written.
+ filepath (str or Path): Path to write data.
+ encoding (str): The encoding format used to encode the ``obj``.
+ Default: 'utf-8'.
+ """
+ self.put(bytes(obj, encoding=encoding), filepath)
+
+ def remove(self, filepath: Union[str, Path]) -> None:
+ """Remove a file.
+
+ Args:
+ filepath (str or Path): Path to be removed.
+ """
+ if not has_method(self._client, 'delete'):
+ raise NotImplementedError(
+ 'Current version of Petrel Python SDK has not supported '
+ 'the `delete` method, please use a higher version or dev'
+ ' branch instead.')
+
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ self._client.delete(filepath)
+
+ def exists(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path exists.
+
+ Args:
+ filepath (str or Path): Path to be checked whether exists.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise.
+ """
+ if not (has_method(self._client, 'contains')
+ and has_method(self._client, 'isdir')):
+ raise NotImplementedError(
+ 'Current version of Petrel Python SDK has not supported '
+ 'the `contains` and `isdir` methods, please use a higher'
+ 'version or dev branch instead.')
+
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ return self._client.contains(filepath) or self._client.isdir(filepath)
+
+ def isdir(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a directory.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a
+ directory.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a directory,
+ ``False`` otherwise.
+ """
+ if not has_method(self._client, 'isdir'):
+ raise NotImplementedError(
+ 'Current version of Petrel Python SDK has not supported '
+ 'the `isdir` method, please use a higher version or dev'
+ ' branch instead.')
+
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ return self._client.isdir(filepath)
+
+ def isfile(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a file.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a file.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a file, ``False``
+ otherwise.
+ """
+ if not has_method(self._client, 'contains'):
+ raise NotImplementedError(
+ 'Current version of Petrel Python SDK has not supported '
+ 'the `contains` method, please use a higher version or '
+ 'dev branch instead.')
+
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ return self._client.contains(filepath)
+
+ def join_path(self, filepath: Union[str, Path],
+ *filepaths: Union[str, Path]) -> str:
+ """Concatenate all file paths.
+
+ Args:
+ filepath (str or Path): Path to be concatenated.
+
+ Returns:
+ str: The result after concatenation.
+ """
+ filepath = self._format_path(self._map_path(filepath))
+ if filepath.endswith('/'):
+ filepath = filepath[:-1]
+ formatted_paths = [filepath]
+ for path in filepaths:
+ formatted_paths.append(self._format_path(self._map_path(path)))
+ return '/'.join(formatted_paths)
+
+ @contextmanager
+ def get_local_path(
+ self,
+ filepath: Union[str,
+ Path]) -> Generator[Union[str, Path], None, None]:
+ """Download a file from ``filepath`` and return a temporary path.
+
+        ``get_local_path`` is decorated by :meth:`contextlib.contextmanager`.
+        It can be called with a ``with`` statement, and when exiting the
+        ``with`` statement, the temporary path will be released.
+
+ Args:
+ filepath (str | Path): Download a file from ``filepath``.
+
+ Examples:
+ >>> client = PetrelBackend()
+            >>> # After exiting from the ``with`` clause,
+ >>> # the path will be removed
+ >>> with client.get_local_path('s3://path/of/your/file') as path:
+ ... # do something here
+
+ Yields:
+ Iterable[str]: Only yield one temporary path.
+ """
+ filepath = self._map_path(filepath)
+ filepath = self._format_path(filepath)
+ assert self.isfile(filepath)
+ try:
+ f = tempfile.NamedTemporaryFile(delete=False)
+ f.write(self.get(filepath))
+ f.close()
+ yield f.name
+ finally:
+ os.remove(f.name)
+
+ def list_dir_or_file(self,
+ dir_path: Union[str, Path],
+ list_dir: bool = True,
+ list_file: bool = True,
+ suffix: Optional[Union[str, Tuple[str]]] = None,
+ recursive: bool = False) -> Iterator[str]:
+ """Scan a directory to find the interested directories or files in
+ arbitrary order.
+
+ Note:
+ Petrel has no concept of directories but it simulates the directory
+ hierarchy in the filesystem through public prefixes. In addition,
+ if the returned path ends with '/', it means the path is a public
+ prefix which is a logical directory.
+
+ Note:
+ :meth:`list_dir_or_file` returns the path relative to ``dir_path``.
+            In addition, the returned path of a directory will not contain
+            the trailing '/', which is consistent with other backends.
+
+ Args:
+ dir_path (str | Path): Path of the directory.
+ list_dir (bool): List the directories. Default: True.
+ list_file (bool): List the path of files. Default: True.
+ suffix (str or tuple[str], optional): File suffix
+ that we are interested in. Default: None.
+ recursive (bool): If set to True, recursively scan the
+ directory. Default: False.
+
+ Yields:
+ Iterable[str]: A relative path to ``dir_path``.
+ """
+ if not has_method(self._client, 'list'):
+ raise NotImplementedError(
+ 'Current version of Petrel Python SDK has not supported '
+ 'the `list` method, please use a higher version or dev'
+ ' branch instead.')
+
+ dir_path = self._map_path(dir_path)
+ dir_path = self._format_path(dir_path)
+ if list_dir and suffix is not None:
+ raise TypeError(
+ '`list_dir` should be False when `suffix` is not None')
+
+ if (suffix is not None) and not isinstance(suffix, (str, tuple)):
+ raise TypeError('`suffix` must be a string or tuple of strings')
+
+ # Petrel's simulated directory hierarchy assumes that directory paths
+ # should end with `/`
+ if not dir_path.endswith('/'):
+ dir_path += '/'
+
+ root = dir_path
+
+ def _list_dir_or_file(dir_path, list_dir, list_file, suffix,
+ recursive):
+ for path in self._client.list(dir_path):
+ # the `self.isdir` is not used here to determine whether path
+ # is a directory, because `self.isdir` relies on
+ # `self._client.list`
+ if path.endswith('/'): # a directory path
+ next_dir_path = self.join_path(dir_path, path)
+ if list_dir:
+ # get the relative path and exclude the last
+ # character '/'
+ rel_dir = next_dir_path[len(root):-1]
+ yield rel_dir
+ if recursive:
+ yield from _list_dir_or_file(next_dir_path, list_dir,
+ list_file, suffix,
+ recursive)
+ else: # a file path
+ absolute_path = self.join_path(dir_path, path)
+ rel_path = absolute_path[len(root):]
+ if (suffix is None
+ or rel_path.endswith(suffix)) and list_file:
+ yield rel_path
+
+ return _list_dir_or_file(dir_path, list_dir, list_file, suffix,
+ recursive)
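+    # Usage sketch (hypothetical bucket path):
+    #   client = PetrelBackend()
+    #   for name in client.list_dir_or_file('s3://bucket/prefix',
+    #                                       list_dir=False, suffix='.jpg'):
+    #       print(name)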
+
+
+class MemcachedBackend(BaseStorageBackend):
+ """Memcached storage backend.
+
+ Attributes:
+ server_list_cfg (str): Config file for memcached server list.
+ client_cfg (str): Config file for memcached client.
+ sys_path (str | None): Additional path to be appended to `sys.path`.
+ Default: None.
+ """
+
+ def __init__(self, server_list_cfg, client_cfg, sys_path=None):
+ if sys_path is not None:
+ import sys
+ sys.path.append(sys_path)
+ try:
+ import mc
+ except ImportError:
+ raise ImportError(
+ 'Please install memcached to enable MemcachedBackend.')
+
+ self.server_list_cfg = server_list_cfg
+ self.client_cfg = client_cfg
+ self._client = mc.MemcachedClient.GetInstance(self.server_list_cfg,
+ self.client_cfg)
+        # mc.pyvector serves as a pointer to a memory cache
+ self._mc_buffer = mc.pyvector()
+
+ def get(self, filepath):
+ filepath = str(filepath)
+ import mc
+ self._client.Get(filepath, self._mc_buffer)
+ value_buf = mc.ConvertBuffer(self._mc_buffer)
+ return value_buf
+
+ def get_text(self, filepath, encoding=None):
+ raise NotImplementedError
+
+
+class LmdbBackend(BaseStorageBackend):
+ """Lmdb storage backend.
+
+ Args:
+ db_path (str): Lmdb database path.
+ readonly (bool, optional): Lmdb environment parameter. If True,
+ disallow any write operations. Default: True.
+ lock (bool, optional): Lmdb environment parameter. If False, when
+ concurrent access occurs, do not lock the database. Default: False.
+ readahead (bool, optional): Lmdb environment parameter. If False,
+ disable the OS filesystem readahead mechanism, which may improve
+ random read performance when a database is larger than RAM.
+ Default: False.
+
+ Attributes:
+ db_path (str): Lmdb database path.
+ """
+
+ def __init__(self,
+ db_path,
+ readonly=True,
+ lock=False,
+ readahead=False,
+ **kwargs):
+ try:
+ import lmdb # NOQA
+ except ImportError:
+ raise ImportError('Please install lmdb to enable LmdbBackend.')
+
+ self.db_path = str(db_path)
+ self.readonly = readonly
+ self.lock = lock
+ self.readahead = readahead
+ self.kwargs = kwargs
+ self._client = None
+
+ def get(self, filepath):
+ """Get values according to the filepath.
+
+ Args:
+ filepath (str | obj:`Path`): Here, filepath is the lmdb key.
+ """
+ if self._client is None:
+ self._client = self._get_client()
+
+ with self._client.begin(write=False) as txn:
+ value_buf = txn.get(str(filepath).encode('utf-8'))
+ return value_buf
+
+ def get_text(self, filepath, encoding=None):
+ raise NotImplementedError
+
+ def _get_client(self):
+ import lmdb
+
+ return lmdb.open(
+ self.db_path,
+ readonly=self.readonly,
+ lock=self.lock,
+ readahead=self.readahead,
+ **self.kwargs)
+
+    def __del__(self):
+        # the client may never have been created if `get` was not called
+        if self._client is not None:
+            self._client.close()
+
+
+class HardDiskBackend(BaseStorageBackend):
+ """Raw hard disks storage backend."""
+
+ _allow_symlink = True
+
+ def get(self, filepath: Union[str, Path]) -> bytes:
+ """Read data from a given ``filepath`` with 'rb' mode.
+
+ Args:
+ filepath (str or Path): Path to read data.
+
+ Returns:
+ bytes: Expected bytes object.
+ """
+ with open(filepath, 'rb') as f:
+ value_buf = f.read()
+ return value_buf
+
+ def get_text(self,
+ filepath: Union[str, Path],
+ encoding: str = 'utf-8') -> str:
+ """Read data from a given ``filepath`` with 'r' mode.
+
+ Args:
+ filepath (str or Path): Path to read data.
+ encoding (str): The encoding format used to open the ``filepath``.
+ Default: 'utf-8'.
+
+ Returns:
+ str: Expected text reading from ``filepath``.
+ """
+ with open(filepath, encoding=encoding) as f:
+ value_buf = f.read()
+ return value_buf
+
+ def put(self, obj: bytes, filepath: Union[str, Path]) -> None:
+ """Write data to a given ``filepath`` with 'wb' mode.
+
+ Note:
+ ``put`` will create a directory if the directory of ``filepath``
+ does not exist.
+
+ Args:
+ obj (bytes): Data to be written.
+ filepath (str or Path): Path to write data.
+ """
+ mmcv.mkdir_or_exist(osp.dirname(filepath))
+ with open(filepath, 'wb') as f:
+ f.write(obj)
+
+ def put_text(self,
+ obj: str,
+ filepath: Union[str, Path],
+ encoding: str = 'utf-8') -> None:
+ """Write data to a given ``filepath`` with 'w' mode.
+
+ Note:
+ ``put_text`` will create a directory if the directory of
+ ``filepath`` does not exist.
+
+ Args:
+ obj (str): Data to be written.
+ filepath (str or Path): Path to write data.
+ encoding (str): The encoding format used to open the ``filepath``.
+ Default: 'utf-8'.
+ """
+ mmcv.mkdir_or_exist(osp.dirname(filepath))
+ with open(filepath, 'w', encoding=encoding) as f:
+ f.write(obj)
+
+ def remove(self, filepath: Union[str, Path]) -> None:
+ """Remove a file.
+
+ Args:
+ filepath (str or Path): Path to be removed.
+ """
+ os.remove(filepath)
+
+ def exists(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path exists.
+
+ Args:
+ filepath (str or Path): Path to be checked whether exists.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise.
+ """
+ return osp.exists(filepath)
+
+ def isdir(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a directory.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a
+ directory.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a directory,
+ ``False`` otherwise.
+ """
+ return osp.isdir(filepath)
+
+ def isfile(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a file.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a file.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a file, ``False``
+ otherwise.
+ """
+ return osp.isfile(filepath)
+
+ def join_path(self, filepath: Union[str, Path],
+ *filepaths: Union[str, Path]) -> str:
+ """Concatenate all file paths.
+
+ Join one or more filepath components intelligently. The return value
+ is the concatenation of filepath and any members of *filepaths.
+
+ Args:
+ filepath (str or Path): Path to be concatenated.
+
+ Returns:
+ str: The result of concatenation.
+ """
+ return osp.join(filepath, *filepaths)
+
+ @contextmanager
+ def get_local_path(
+ self,
+ filepath: Union[str,
+ Path]) -> Generator[Union[str, Path], None, None]:
+ """Only for unified API and do nothing."""
+ yield filepath
+
+ def list_dir_or_file(self,
+ dir_path: Union[str, Path],
+ list_dir: bool = True,
+ list_file: bool = True,
+ suffix: Optional[Union[str, Tuple[str]]] = None,
+ recursive: bool = False) -> Iterator[str]:
+ """Scan a directory to find the interested directories or files in
+ arbitrary order.
+
+ Note:
+ :meth:`list_dir_or_file` returns the path relative to ``dir_path``.
+
+ Args:
+ dir_path (str | Path): Path of the directory.
+ list_dir (bool): List the directories. Default: True.
+ list_file (bool): List the path of files. Default: True.
+ suffix (str or tuple[str], optional): File suffix
+ that we are interested in. Default: None.
+ recursive (bool): If set to True, recursively scan the
+ directory. Default: False.
+
+ Yields:
+ Iterable[str]: A relative path to ``dir_path``.
+ """
+ if list_dir and suffix is not None:
+ raise TypeError('`suffix` should be None when `list_dir` is True')
+
+ if (suffix is not None) and not isinstance(suffix, (str, tuple)):
+ raise TypeError('`suffix` must be a string or tuple of strings')
+
+ root = dir_path
+
+ def _list_dir_or_file(dir_path, list_dir, list_file, suffix,
+ recursive):
+ for entry in os.scandir(dir_path):
+ if not entry.name.startswith('.') and entry.is_file():
+ rel_path = osp.relpath(entry.path, root)
+ if (suffix is None
+ or rel_path.endswith(suffix)) and list_file:
+ yield rel_path
+ elif osp.isdir(entry.path):
+ if list_dir:
+ rel_dir = osp.relpath(entry.path, root)
+ yield rel_dir
+ if recursive:
+ yield from _list_dir_or_file(entry.path, list_dir,
+ list_file, suffix,
+ recursive)
+
+ return _list_dir_or_file(dir_path, list_dir, list_file, suffix,
+ recursive)
+
+
+class HTTPBackend(BaseStorageBackend):
+    """HTTP and HTTPS storage backend."""
+
+ def get(self, filepath):
+ value_buf = urlopen(filepath).read()
+ return value_buf
+
+ def get_text(self, filepath, encoding='utf-8'):
+ value_buf = urlopen(filepath).read()
+ return value_buf.decode(encoding)
+
+ @contextmanager
+ def get_local_path(
+ self, filepath: str) -> Generator[Union[str, Path], None, None]:
+ """Download a file from ``filepath``.
+
+        ``get_local_path`` is decorated by :meth:`contextlib.contextmanager`.
+        It can be called with a ``with`` statement, and when exiting the
+        ``with`` statement, the temporary path will be released.
+
+ Args:
+ filepath (str): Download a file from ``filepath``.
+
+ Examples:
+ >>> client = HTTPBackend()
+            >>> # After exiting from the ``with`` clause,
+ >>> # the path will be removed
+ >>> with client.get_local_path('http://path/of/your/file') as path:
+ ... # do something here
+ """
+ try:
+ f = tempfile.NamedTemporaryFile(delete=False)
+ f.write(self.get(filepath))
+ f.close()
+ yield f.name
+ finally:
+ os.remove(f.name)
+
+
+class FileClient:
+ """A general file client to access files in different backends.
+
+ The client loads a file or text in a specified backend from its path
+ and returns it as a binary or text file. There are two ways to choose a
+    backend: the name of the backend or the prefix of the path. Although both
+    can be used to choose a storage backend, ``backend`` has the higher
+    priority, that is, if both are set, the storage backend will be chosen by
+    the ``backend`` argument. If both are `None`, the disk backend will be
+    chosen. Note that other backend accessors can also be registered with a
+    given name, prefixes, and backend class. In addition, the singleton
+    pattern is used to avoid repeated object creation: if the arguments are
+    the same, the same object will be returned.
+
+ Args:
+ backend (str, optional): The storage backend type. Options are "disk",
+ "ceph", "memcached", "lmdb", "http" and "petrel". Default: None.
+ prefix (str, optional): The prefix of the registered storage backend.
+ Options are "s3", "http", "https". Default: None.
+
+ Examples:
+ >>> # only set backend
+ >>> file_client = FileClient(backend='petrel')
+ >>> # only set prefix
+ >>> file_client = FileClient(prefix='s3')
+ >>> # set both backend and prefix but use backend to choose client
+ >>> file_client = FileClient(backend='petrel', prefix='s3')
+ >>> # if the arguments are the same, the same object is returned
+ >>> file_client1 = FileClient(backend='petrel')
+ >>> file_client1 is file_client
+ True
+
+ Attributes:
+ client (:obj:`BaseStorageBackend`): The backend object.
+ """
+
+ _backends = {
+ 'disk': HardDiskBackend,
+ 'ceph': CephBackend,
+ 'memcached': MemcachedBackend,
+ 'lmdb': LmdbBackend,
+ 'petrel': PetrelBackend,
+ 'http': HTTPBackend,
+ }
+
+ _prefix_to_backends = {
+ 's3': PetrelBackend,
+ 'http': HTTPBackend,
+ 'https': HTTPBackend,
+ }
+
+ _instances: dict = {}
+
+ client: Any
+
+ def __new__(cls, backend=None, prefix=None, **kwargs):
+ if backend is None and prefix is None:
+ backend = 'disk'
+ if backend is not None and backend not in cls._backends:
+ raise ValueError(
+ f'Backend {backend} is not supported. Currently supported ones'
+ f' are {list(cls._backends.keys())}')
+ if prefix is not None and prefix not in cls._prefix_to_backends:
+ raise ValueError(
+ f'prefix {prefix} is not supported. Currently supported ones '
+ f'are {list(cls._prefix_to_backends.keys())}')
+
+ # concatenate the arguments to a unique key for determining whether
+ # objects with the same arguments were created
+ arg_key = f'{backend}:{prefix}'
+ for key, value in kwargs.items():
+ arg_key += f':{key}:{value}'
+
+ if arg_key in cls._instances:
+ _instance = cls._instances[arg_key]
+ else:
+ # create a new object and put it to _instance
+ _instance = super().__new__(cls)
+ if backend is not None:
+ _instance.client = cls._backends[backend](**kwargs)
+ else:
+ _instance.client = cls._prefix_to_backends[prefix](**kwargs)
+
+ cls._instances[arg_key] = _instance
+
+ return _instance
+
+ @property
+ def name(self):
+ return self.client.name
+
+ @property
+ def allow_symlink(self):
+ return self.client.allow_symlink
+
+ @staticmethod
+ def parse_uri_prefix(uri: Union[str, Path]) -> Optional[str]:
+ """Parse the prefix of a uri.
+
+ Args:
+ uri (str | Path): Uri to be parsed that contains the file prefix.
+
+ Examples:
+ >>> FileClient.parse_uri_prefix('s3://path/of/your/file')
+ 's3'
+
+ Returns:
+ str | None: Return the prefix of uri if the uri contains '://' else
+ ``None``.
+ """
+ assert is_filepath(uri)
+ uri = str(uri)
+ if '://' not in uri:
+ return None
+ else:
+ prefix, _ = uri.split('://')
+            # In the case of PetrelBackend, the prefix may contain the cluster
+ # name like clusterName:s3
+ if ':' in prefix:
+ _, prefix = prefix.split(':')
+ return prefix
+
+ @classmethod
+ def infer_client(cls,
+ file_client_args: Optional[dict] = None,
+ uri: Optional[Union[str, Path]] = None) -> 'FileClient':
+ """Infer a suitable file client based on the URI and arguments.
+
+ Args:
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. Default: None.
+ uri (str | Path, optional): Uri to be parsed that contains the file
+ prefix. Default: None.
+
+ Examples:
+ >>> uri = 's3://path/of/your/file'
+ >>> file_client = FileClient.infer_client(uri=uri)
+ >>> file_client_args = {'backend': 'petrel'}
+ >>> file_client = FileClient.infer_client(file_client_args)
+
+ Returns:
+ FileClient: Instantiated FileClient object.
+ """
+ assert file_client_args is not None or uri is not None
+ if file_client_args is None:
+ file_prefix = cls.parse_uri_prefix(uri) # type: ignore
+ return cls(prefix=file_prefix)
+ else:
+ return cls(**file_client_args)
+
+ @classmethod
+ def _register_backend(cls, name, backend, force=False, prefixes=None):
+ if not isinstance(name, str):
+ raise TypeError('the backend name should be a string, '
+ f'but got {type(name)}')
+ if not inspect.isclass(backend):
+ raise TypeError(
+ f'backend should be a class but got {type(backend)}')
+ if not issubclass(backend, BaseStorageBackend):
+ raise TypeError(
+ f'backend {backend} is not a subclass of BaseStorageBackend')
+ if not force and name in cls._backends:
+ raise KeyError(
+ f'{name} is already registered as a storage backend, '
+ 'add "force=True" if you want to override it')
+
+ if name in cls._backends and force:
+ for arg_key, instance in list(cls._instances.items()):
+ if isinstance(instance.client, cls._backends[name]):
+ cls._instances.pop(arg_key)
+ cls._backends[name] = backend
+
+ if prefixes is not None:
+ if isinstance(prefixes, str):
+ prefixes = [prefixes]
+ else:
+ assert isinstance(prefixes, (list, tuple))
+ for prefix in prefixes:
+ if prefix not in cls._prefix_to_backends:
+ cls._prefix_to_backends[prefix] = backend
+ elif (prefix in cls._prefix_to_backends) and force:
+ overridden_backend = cls._prefix_to_backends[prefix]
+ if isinstance(overridden_backend, list):
+ overridden_backend = tuple(overridden_backend)
+ for arg_key, instance in list(cls._instances.items()):
+ if isinstance(instance.client, overridden_backend):
+ cls._instances.pop(arg_key)
+ cls._prefix_to_backends[prefix] = backend
+ else:
+ raise KeyError(
+ f'{prefix} is already registered as a storage backend,'
+ ' add "force=True" if you want to override it')
+
+ @classmethod
+ def register_backend(cls, name, backend=None, force=False, prefixes=None):
+ """Register a backend to FileClient.
+
+ This method can be used as a normal class method or a decorator.
+
+ .. code-block:: python
+
+ class NewBackend(BaseStorageBackend):
+
+ def get(self, filepath):
+ return filepath
+
+ def get_text(self, filepath):
+ return filepath
+
+ FileClient.register_backend('new', NewBackend)
+
+ or
+
+ .. code-block:: python
+
+ @FileClient.register_backend('new')
+ class NewBackend(BaseStorageBackend):
+
+ def get(self, filepath):
+ return filepath
+
+ def get_text(self, filepath):
+ return filepath
+
+ Args:
+ name (str): The name of the registered backend.
+ backend (class, optional): The backend class to be registered,
+ which must be a subclass of :class:`BaseStorageBackend`.
+ When this method is used as a decorator, backend is None.
+ Defaults to None.
+ force (bool, optional): Whether to override the backend if the name
+ has already been registered. Defaults to False.
+ prefixes (str or list[str] or tuple[str], optional): The prefixes
+ of the registered storage backend. Default: None.
+ `New in version 1.3.15.`
+ """
+ if backend is not None:
+ cls._register_backend(
+ name, backend, force=force, prefixes=prefixes)
+ return
+
+ def _register(backend_cls):
+ cls._register_backend(
+ name, backend_cls, force=force, prefixes=prefixes)
+ return backend_cls
+
+ return _register
+
+ def get(self, filepath: Union[str, Path]) -> Union[bytes, memoryview]:
+ """Read data from a given ``filepath`` with 'rb' mode.
+
+ Note:
+ There are two types of return values for ``get``, one is ``bytes``
+ and the other is ``memoryview``. The advantage of using memoryview
+ is that you can avoid copying, and if you want to convert it to
+ ``bytes``, you can use ``.tobytes()``.
+
+ Args:
+ filepath (str or Path): Path to read data.
+
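+        Examples:
+            >>> # illustrative sketch: convert explicitly when ``bytes`` is
+            >>> # required, since some backends return ``memoryview``
+            >>> file_client = FileClient(backend='disk')
+            >>> buf = file_client.get('/path/of/your/file')
+            >>> buf = buf.tobytes() if isinstance(buf, memoryview) else buf
+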
+ Returns:
+ bytes | memoryview: Expected bytes object or a memory view of the
+ bytes object.
+ """
+ return self.client.get(filepath)
+
+ def get_text(self, filepath: Union[str, Path], encoding='utf-8') -> str:
+ """Read data from a given ``filepath`` with 'r' mode.
+
+ Args:
+ filepath (str or Path): Path to read data.
+ encoding (str): The encoding format used to open the ``filepath``.
+ Default: 'utf-8'.
+
+ Returns:
+ str: Expected text reading from ``filepath``.
+ """
+ return self.client.get_text(filepath, encoding)
+
+ def put(self, obj: bytes, filepath: Union[str, Path]) -> None:
+ """Write data to a given ``filepath`` with 'wb' mode.
+
+ Note:
+ ``put`` should create a directory if the directory of ``filepath``
+ does not exist.
+
+ Args:
+ obj (bytes): Data to be written.
+ filepath (str or Path): Path to write data.
+ """
+ self.client.put(obj, filepath)
+
+ def put_text(self, obj: str, filepath: Union[str, Path]) -> None:
+ """Write data to a given ``filepath`` with 'w' mode.
+
+ Note:
+ ``put_text`` should create a directory if the directory of
+ ``filepath`` does not exist.
+
+ Args:
+ obj (str): Data to be written.
+ filepath (str or Path): Path to write data.
+ """
+ self.client.put_text(obj, filepath)
+
+ def remove(self, filepath: Union[str, Path]) -> None:
+ """Remove a file.
+
+ Args:
+ filepath (str, Path): Path to be removed.
+ """
+ self.client.remove(filepath)
+
+ def exists(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path exists.
+
+ Args:
+            filepath (str or Path): Path to be checked whether it exists.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` exists, ``False`` otherwise.
+ """
+ return self.client.exists(filepath)
+
+ def isdir(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a directory.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a
+ directory.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a directory,
+ ``False`` otherwise.
+ """
+ return self.client.isdir(filepath)
+
+ def isfile(self, filepath: Union[str, Path]) -> bool:
+ """Check whether a file path is a file.
+
+ Args:
+ filepath (str or Path): Path to be checked whether it is a file.
+
+ Returns:
+ bool: Return ``True`` if ``filepath`` points to a file, ``False``
+ otherwise.
+ """
+ return self.client.isfile(filepath)
+
+ def join_path(self, filepath: Union[str, Path],
+ *filepaths: Union[str, Path]) -> str:
+ """Concatenate all file paths.
+
+ Join one or more filepath components intelligently. The return value
+ is the concatenation of filepath and any members of *filepaths.
+
+ Args:
+ filepath (str or Path): Path to be concatenated.
+
+ Returns:
+ str: The result of concatenation.
+ """
+ return self.client.join_path(filepath, *filepaths)
+
+ @contextmanager
+ def get_local_path(
+ self,
+ filepath: Union[str,
+ Path]) -> Generator[Union[str, Path], None, None]:
+ """Download data from ``filepath`` and write the data to local path.
+
+        ``get_local_path`` is decorated by :meth:`contextlib.contextmanager`.
+        It can be called with a ``with`` statement, and when exiting the
+        ``with`` statement, the temporary path will be released.
+
+ Note:
+ If the ``filepath`` is a local path, just return itself.
+
+ .. warning::
+ ``get_local_path`` is an experimental interface that may change in
+ the future.
+
+ Args:
+            filepath (str or Path): Path of the data to be read.
+
+ Examples:
+ >>> file_client = FileClient(prefix='s3')
+ >>> with file_client.get_local_path('s3://bucket/abc.jpg') as path:
+ ... # do something here
+
+ Yields:
+ Iterable[str]: Only yield one path.
+ """
+ with self.client.get_local_path(str(filepath)) as local_path:
+ yield local_path
+
+ def list_dir_or_file(self,
+ dir_path: Union[str, Path],
+ list_dir: bool = True,
+ list_file: bool = True,
+ suffix: Optional[Union[str, Tuple[str]]] = None,
+ recursive: bool = False) -> Iterator[str]:
+        """Scan a directory to find the directories or files of interest in
+        arbitrary order.
+
+ Note:
+ :meth:`list_dir_or_file` returns the path relative to ``dir_path``.
+
+ Args:
+ dir_path (str | Path): Path of the directory.
+ list_dir (bool): List the directories. Default: True.
+ list_file (bool): List the path of files. Default: True.
+ suffix (str or tuple[str], optional): File suffix
+ that we are interested in. Default: None.
+ recursive (bool): If set to True, recursively scan the
+ directory. Default: False.
+
+ Yields:
+ Iterable[str]: A relative path to ``dir_path``.
+ """
+ yield from self.client.list_dir_or_file(dir_path, list_dir, list_file,
+ suffix, recursive)
diff --git a/mmcv/mmcv/fileio/handlers/__init__.py b/mmcv/mmcv/fileio/handlers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa24d91972837b8756b225f4879bac20436eb72a
--- /dev/null
+++ b/mmcv/mmcv/fileio/handlers/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base import BaseFileHandler
+from .json_handler import JsonHandler
+from .pickle_handler import PickleHandler
+from .yaml_handler import YamlHandler
+
+__all__ = ['BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler']
diff --git a/mmcv/mmcv/fileio/handlers/base.py b/mmcv/mmcv/fileio/handlers/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c9cc15b67cbf7d320c2b9c6cbd441a5d5adf235
--- /dev/null
+++ b/mmcv/mmcv/fileio/handlers/base.py
@@ -0,0 +1,30 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+
+
+class BaseFileHandler(metaclass=ABCMeta):
+ # `str_like` is a flag to indicate whether the type of file object is
+ # str-like object or bytes-like object. Pickle only processes bytes-like
+ # objects but json only processes str-like object. If it is str-like
+ # object, `StringIO` will be used to process the buffer.
+ str_like = True
+
+ @abstractmethod
+ def load_from_fileobj(self, file, **kwargs):
+ pass
+
+ @abstractmethod
+ def dump_to_fileobj(self, obj, file, **kwargs):
+ pass
+
+ @abstractmethod
+ def dump_to_str(self, obj, **kwargs):
+ pass
+
+ def load_from_path(self, filepath: str, mode: str = 'r', **kwargs):
+ with open(filepath, mode) as f:
+ return self.load_from_fileobj(f, **kwargs)
+
+ def dump_to_path(self, obj, filepath: str, mode: str = 'w', **kwargs):
+ with open(filepath, mode) as f:
+ self.dump_to_fileobj(obj, f, **kwargs)
diff --git a/mmcv/mmcv/fileio/handlers/json_handler.py b/mmcv/mmcv/fileio/handlers/json_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..18d4f15f74139d20adff18b20be5529c592a66b6
--- /dev/null
+++ b/mmcv/mmcv/fileio/handlers/json_handler.py
@@ -0,0 +1,36 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+
+import numpy as np
+
+from .base import BaseFileHandler
+
+
+def set_default(obj):
+ """Set default json values for non-serializable values.
+
+ It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list.
+ It also converts ``np.generic`` (including ``np.int32``, ``np.float32``,
+ etc.) into plain numbers of plain python built-in types.
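+
+    Examples:
+        >>> # illustrative sketch
+        >>> json.dumps({'ids': np.arange(3)}, default=set_default)
+        '{"ids": [0, 1, 2]}'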
+ """
+ if isinstance(obj, (set, range)):
+ return list(obj)
+ elif isinstance(obj, np.ndarray):
+ return obj.tolist()
+ elif isinstance(obj, np.generic):
+ return obj.item()
+ raise TypeError(f'{type(obj)} is unsupported for json dump')
+
+
+class JsonHandler(BaseFileHandler):
+
+ def load_from_fileobj(self, file):
+ return json.load(file)
+
+ def dump_to_fileobj(self, obj, file, **kwargs):
+ kwargs.setdefault('default', set_default)
+ json.dump(obj, file, **kwargs)
+
+ def dump_to_str(self, obj, **kwargs):
+ kwargs.setdefault('default', set_default)
+ return json.dumps(obj, **kwargs)
diff --git a/mmcv/mmcv/fileio/handlers/pickle_handler.py b/mmcv/mmcv/fileio/handlers/pickle_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..073856fd25a731b42f3cd19269ad95744b20598f
--- /dev/null
+++ b/mmcv/mmcv/fileio/handlers/pickle_handler.py
@@ -0,0 +1,26 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import pickle
+
+from .base import BaseFileHandler
+
+
+class PickleHandler(BaseFileHandler):
+
+ str_like = False
+
+ def load_from_fileobj(self, file, **kwargs):
+ return pickle.load(file, **kwargs)
+
+ def load_from_path(self, filepath, **kwargs):
+ return super().load_from_path(filepath, mode='rb', **kwargs)
+
+ def dump_to_str(self, obj, **kwargs):
+ kwargs.setdefault('protocol', 2)
+ return pickle.dumps(obj, **kwargs)
+
+ def dump_to_fileobj(self, obj, file, **kwargs):
+ kwargs.setdefault('protocol', 2)
+ pickle.dump(obj, file, **kwargs)
+
+ def dump_to_path(self, obj, filepath, **kwargs):
+ super().dump_to_path(obj, filepath, mode='wb', **kwargs)
diff --git a/mmcv/mmcv/fileio/handlers/yaml_handler.py b/mmcv/mmcv/fileio/handlers/yaml_handler.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c1b077943d634b3ddcf5ee470855179b8308e9c
--- /dev/null
+++ b/mmcv/mmcv/fileio/handlers/yaml_handler.py
@@ -0,0 +1,25 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import yaml
+
+try:
+ from yaml import CDumper as Dumper
+ from yaml import CLoader as Loader
+except ImportError:
+ from yaml import Loader, Dumper # type: ignore
+
+from .base import BaseFileHandler # isort:skip
+
+
+class YamlHandler(BaseFileHandler):
+
+ def load_from_fileobj(self, file, **kwargs):
+ kwargs.setdefault('Loader', Loader)
+ return yaml.load(file, **kwargs)
+
+ def dump_to_fileobj(self, obj, file, **kwargs):
+ kwargs.setdefault('Dumper', Dumper)
+ yaml.dump(obj, file, **kwargs)
+
+ def dump_to_str(self, obj, **kwargs):
+ kwargs.setdefault('Dumper', Dumper)
+ return yaml.dump(obj, **kwargs)
diff --git a/mmcv/mmcv/fileio/io.py b/mmcv/mmcv/fileio/io.py
new file mode 100644
index 0000000000000000000000000000000000000000..91192103cf331e8ceb970d6f1f5ac050137c0871
--- /dev/null
+++ b/mmcv/mmcv/fileio/io.py
@@ -0,0 +1,163 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from io import BytesIO, StringIO
+from pathlib import Path
+from typing import Any, Callable, Dict, List, Optional, TextIO, Union
+
+from ..utils import is_list_of
+from .file_client import FileClient
+from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler
+
+FileLikeObject = Union[TextIO, StringIO, BytesIO]
+
+file_handlers = {
+ 'json': JsonHandler(),
+ 'yaml': YamlHandler(),
+ 'yml': YamlHandler(),
+ 'pickle': PickleHandler(),
+ 'pkl': PickleHandler()
+}
+
+
+def load(file: Union[str, Path, FileLikeObject],
+ file_format: Optional[str] = None,
+ file_client_args: Optional[Dict] = None,
+ **kwargs):
+ """Load data from json/yaml/pickle files.
+
+ This method provides a unified api for loading data from serialized files.
+
+ Note:
+        In v1.3.16 and later, ``load`` supports loading data from serialized
+        files that can be stored in different backends.
+
+ Args:
+ file (str or :obj:`Path` or file-like object): Filename or a file-like
+ object.
+ file_format (str, optional): If not specified, the file format will be
+ inferred from the file extension, otherwise use the specified one.
+ Currently supported formats include "json", "yaml/yml" and
+ "pickle/pkl".
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Examples:
+        >>> load('/path/of/your/file') # file is stored on disk
+        >>> load('https://path/of/your/file') # file is stored on the Internet
+        >>> load('s3://path/of/your/file') # file is stored in petrel
+
+ Returns:
+ The content from the file.
+ """
+ if isinstance(file, Path):
+ file = str(file)
+ if file_format is None and isinstance(file, str):
+ file_format = file.split('.')[-1]
+ if file_format not in file_handlers:
+ raise TypeError(f'Unsupported format: {file_format}')
+
+ handler = file_handlers[file_format]
+ f: FileLikeObject
+ if isinstance(file, str):
+ file_client = FileClient.infer_client(file_client_args, file)
+ if handler.str_like:
+ with StringIO(file_client.get_text(file)) as f:
+ obj = handler.load_from_fileobj(f, **kwargs)
+ else:
+ with BytesIO(file_client.get(file)) as f:
+ obj = handler.load_from_fileobj(f, **kwargs)
+ elif hasattr(file, 'read'):
+ obj = handler.load_from_fileobj(file, **kwargs)
+ else:
+ raise TypeError('"file" must be a filepath str or a file-object')
+ return obj
+
+
+def dump(obj: Any,
+ file: Optional[Union[str, Path, FileLikeObject]] = None,
+ file_format: Optional[str] = None,
+ file_client_args: Optional[Dict] = None,
+ **kwargs):
+ """Dump data to json/yaml/pickle strings or files.
+
+ This method provides a unified api for dumping data as strings or to files,
+ and also supports custom arguments for each file format.
+
+ Note:
+        In v1.3.16 and later, ``dump`` supports dumping data as strings or to
+        files which can be saved to different backends.
+
+ Args:
+ obj (any): The python object to be dumped.
+ file (str or :obj:`Path` or file-like object, optional): If not
+ specified, then the object is dumped to a str, otherwise to a file
+ specified by the filename or file-like object.
+ file_format (str, optional): Same as :func:`load`.
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Examples:
+ >>> dump('hello world', '/path/of/your/file') # disk
+ >>> dump('hello world', 's3://path/of/your/file') # ceph or petrel
+
+    Returns:
+        The dumped str/bytes when ``file`` is None, otherwise None.
+ """
+ if isinstance(file, Path):
+ file = str(file)
+ if file_format is None:
+ if isinstance(file, str):
+ file_format = file.split('.')[-1]
+ elif file is None:
+ raise ValueError(
+ 'file_format must be specified since file is None')
+ if file_format not in file_handlers:
+ raise TypeError(f'Unsupported format: {file_format}')
+ f: FileLikeObject
+ handler = file_handlers[file_format]
+ if file is None:
+ return handler.dump_to_str(obj, **kwargs)
+ elif isinstance(file, str):
+ file_client = FileClient.infer_client(file_client_args, file)
+ if handler.str_like:
+ with StringIO() as f:
+ handler.dump_to_fileobj(obj, f, **kwargs)
+ file_client.put_text(f.getvalue(), file)
+ else:
+ with BytesIO() as f:
+ handler.dump_to_fileobj(obj, f, **kwargs)
+ file_client.put(f.getvalue(), file)
+ elif hasattr(file, 'write'):
+ handler.dump_to_fileobj(obj, file, **kwargs)
+ else:
+ raise TypeError('"file" must be a filename str or a file-object')
+
+
+def _register_handler(handler: BaseFileHandler,
+ file_formats: Union[str, List[str]]) -> None:
+ """Register a handler for some file extensions.
+
+ Args:
+ handler (:obj:`BaseFileHandler`): Handler to be registered.
+ file_formats (str or list[str]): File formats to be handled by this
+ handler.
+ """
+ if not isinstance(handler, BaseFileHandler):
+ raise TypeError(
+ f'handler must be a child of BaseFileHandler, not {type(handler)}')
+ if isinstance(file_formats, str):
+ file_formats = [file_formats]
+ if not is_list_of(file_formats, str):
+ raise TypeError('file_formats must be a str or a list of str')
+ for ext in file_formats:
+ file_handlers[ext] = handler
+
+
+def register_handler(file_formats: Union[str, list], **kwargs) -> Callable:
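+    """Register a handler for some file extensions.
+
+    A thin decorator wrapper around :func:`_register_handler`.
+
+    Examples:
+        >>> # illustrative sketch; ``TxtHandler`` is a hypothetical handler
+        >>> @register_handler('txt')
+        ... class TxtHandler(BaseFileHandler):
+        ...     def load_from_fileobj(self, file):
+        ...         return file.read()
+        ...     def dump_to_fileobj(self, obj, file):
+        ...         file.write(str(obj))
+        ...     def dump_to_str(self, obj):
+        ...         return str(obj)
+    """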
+
+ def wrap(cls):
+ _register_handler(cls(**kwargs), file_formats)
+ return cls
+
+ return wrap
diff --git a/mmcv/mmcv/fileio/parse.py b/mmcv/mmcv/fileio/parse.py
new file mode 100644
index 0000000000000000000000000000000000000000..f28e59119325a1bb68b38dd884c59b68dbed6508
--- /dev/null
+++ b/mmcv/mmcv/fileio/parse.py
@@ -0,0 +1,99 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from io import StringIO
+from pathlib import Path
+from typing import Dict, List, Optional, Union
+
+from .file_client import FileClient
+
+
+def list_from_file(filename: Union[str, Path],
+ prefix: str = '',
+ offset: int = 0,
+ max_num: int = 0,
+ encoding: str = 'utf-8',
+ file_client_args: Optional[Dict] = None) -> List:
+ """Load a text file and parse the content as a list of strings.
+
+ Note:
+        In v1.3.16 and later, ``list_from_file`` supports loading a text file
+        which can be stored in different backends and parsing the content as
+        a list of strings.
+
+ Args:
+ filename (str): Filename.
+ prefix (str): The prefix to be inserted to the beginning of each item.
+        offset (int): The number of lines to skip at the beginning.
+        max_num (int): The maximum number of lines to be read,
+            zero and negative values mean no limitation.
+ encoding (str): Encoding used to open the file. Default utf-8.
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Examples:
+ >>> list_from_file('/path/of/your/file') # disk
+ ['hello', 'world']
+ >>> list_from_file('s3://path/of/your/file') # ceph or petrel
+ ['hello', 'world']
+
+ Returns:
+ list[str]: A list of strings.
+ """
+ cnt = 0
+ item_list = []
+ file_client = FileClient.infer_client(file_client_args, filename)
+ with StringIO(file_client.get_text(filename, encoding)) as f:
+ for _ in range(offset):
+ f.readline()
+ for line in f:
+ if 0 < max_num <= cnt:
+ break
+ item_list.append(prefix + line.rstrip('\n\r'))
+ cnt += 1
+ return item_list
+
+
+def dict_from_file(filename: Union[str, Path],
+ key_type: type = str,
+ encoding: str = 'utf-8',
+ file_client_args: Optional[Dict] = None) -> Dict:
+ """Load a text file and parse the content as a dict.
+
+    Each line of the text file should contain two or more columns split by
+    whitespace or tabs. The first column will be parsed as dict keys, and
+ the following columns will be parsed as dict values.
+
+ Note:
+ In v1.3.16 and later, ``dict_from_file`` supports loading a text file
+        which can be stored in different backends and parsing the content as
+ a dict.
+
+ Args:
+        filename (str): Filename.
+        key_type (type): Type of the dict keys. str is used by default and
+            type conversion will be performed if specified.
+ encoding (str): Encoding used to open the file. Default utf-8.
+ file_client_args (dict, optional): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Examples:
+ >>> dict_from_file('/path/of/your/file') # disk
+ {'key1': 'value1', 'key2': 'value2'}
+ >>> dict_from_file('s3://path/of/your/file') # ceph or petrel
+ {'key1': 'value1', 'key2': 'value2'}
+
+ Returns:
+ dict: The parsed contents.
+ """
+ mapping = {}
+ file_client = FileClient.infer_client(file_client_args, filename)
+ with StringIO(file_client.get_text(filename, encoding)) as f:
+ for line in f:
+ items = line.rstrip('\n').split()
+ assert len(items) >= 2
+ key = key_type(items[0])
+ val = items[1:] if len(items) > 2 else items[1]
+ mapping[key] = val
+ return mapping
diff --git a/mmcv/mmcv/image/__init__.py b/mmcv/mmcv/image/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..92ecec4046a6f5ee25b4ea07215ed7c7c810dcfa
--- /dev/null
+++ b/mmcv/mmcv/image/__init__.py
@@ -0,0 +1,29 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .colorspace import (bgr2gray, bgr2hls, bgr2hsv, bgr2rgb, bgr2ycbcr,
+ gray2bgr, gray2rgb, hls2bgr, hsv2bgr, imconvert,
+ rgb2bgr, rgb2gray, rgb2ycbcr, ycbcr2bgr, ycbcr2rgb)
+from .geometric import (cutout, imcrop, imflip, imflip_, impad,
+ impad_to_multiple, imrescale, imresize, imresize_like,
+ imresize_to_multiple, imrotate, imshear, imtranslate,
+ rescale_size)
+from .io import imfrombytes, imread, imwrite, supported_backends, use_backend
+from .misc import tensor2imgs
+from .photometric import (adjust_brightness, adjust_color, adjust_contrast,
+ adjust_hue, adjust_lighting, adjust_sharpness,
+ auto_contrast, clahe, imdenormalize, imequalize,
+ iminvert, imnormalize, imnormalize_, lut_transform,
+ posterize, solarize)
+
+__all__ = [
+ 'bgr2gray', 'bgr2hls', 'bgr2hsv', 'bgr2rgb', 'gray2bgr', 'gray2rgb',
+ 'hls2bgr', 'hsv2bgr', 'imconvert', 'rgb2bgr', 'rgb2gray', 'imrescale',
+ 'imresize', 'imresize_like', 'imresize_to_multiple', 'rescale_size',
+ 'imcrop', 'imflip', 'imflip_', 'impad', 'impad_to_multiple', 'imrotate',
+ 'imfrombytes', 'imread', 'imwrite', 'supported_backends', 'use_backend',
+ 'imdenormalize', 'imnormalize', 'imnormalize_', 'iminvert', 'posterize',
+ 'solarize', 'rgb2ycbcr', 'bgr2ycbcr', 'ycbcr2rgb', 'ycbcr2bgr',
+ 'tensor2imgs', 'imshear', 'imtranslate', 'adjust_color', 'imequalize',
+ 'adjust_brightness', 'adjust_contrast', 'lut_transform', 'clahe',
+ 'adjust_sharpness', 'auto_contrast', 'cutout', 'adjust_lighting',
+ 'adjust_hue'
+]
diff --git a/mmcv/mmcv/image/colorspace.py b/mmcv/mmcv/image/colorspace.py
new file mode 100644
index 0000000000000000000000000000000000000000..08f9952408c8e0bb38b17c10e2089e900ed418c2
--- /dev/null
+++ b/mmcv/mmcv/image/colorspace.py
@@ -0,0 +1,309 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Callable, Union
+
+import cv2
+import numpy as np
+
+
+def imconvert(img: np.ndarray, src: str, dst: str) -> np.ndarray:
+ """Convert an image from the src colorspace to dst colorspace.
+
+ Args:
+ img (ndarray): The input image.
+ src (str): The source colorspace, e.g., 'rgb', 'hsv'.
+ dst (str): The destination colorspace, e.g., 'rgb', 'hsv'.
+
+ Returns:
+ ndarray: The converted image.
+ """
+ code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}')
+ out_img = cv2.cvtColor(img, code)
+ return out_img
+
+
+def bgr2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray:
+ """Convert a BGR image to grayscale image.
+
+ Args:
+ img (ndarray): The input image.
+ keepdim (bool): If False (by default), then return the grayscale image
+ with 2 dims, otherwise 3 dims.
+
+ Returns:
+ ndarray: The converted grayscale image.
+ """
+ out_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+ if keepdim:
+ out_img = out_img[..., None]
+ return out_img
+
+
+def rgb2gray(img: np.ndarray, keepdim: bool = False) -> np.ndarray:
+    """Convert an RGB image to a grayscale image.
+
+ Args:
+ img (ndarray): The input image.
+ keepdim (bool): If False (by default), then return the grayscale image
+ with 2 dims, otherwise 3 dims.
+
+ Returns:
+ ndarray: The converted grayscale image.
+ """
+ out_img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
+ if keepdim:
+ out_img = out_img[..., None]
+ return out_img
+
+
+def gray2bgr(img: np.ndarray) -> np.ndarray:
+ """Convert a grayscale image to BGR image.
+
+ Args:
+ img (ndarray): The input image.
+
+ Returns:
+ ndarray: The converted BGR image.
+ """
+ img = img[..., None] if img.ndim == 2 else img
+ out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+ return out_img
+
+
+def gray2rgb(img: np.ndarray) -> np.ndarray:
+ """Convert a grayscale image to RGB image.
+
+ Args:
+ img (ndarray): The input image.
+
+ Returns:
+ ndarray: The converted RGB image.
+ """
+ img = img[..., None] if img.ndim == 2 else img
+ out_img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB)
+ return out_img
+
+
+def _convert_input_type_range(img: np.ndarray) -> np.ndarray:
+ """Convert the type and range of the input image.
+
+ It converts the input image to np.float32 type and range of [0, 1].
+ It is mainly used for pre-processing the input image in colorspace
+ conversion functions such as rgb2ycbcr and ycbcr2rgb.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+
+ Returns:
+ (ndarray): The converted image with type of np.float32 and range of
+ [0, 1].
+ """
+ img_type = img.dtype
+ img = img.astype(np.float32)
+ if img_type == np.float32:
+ pass
+ elif img_type == np.uint8:
+ img /= 255.
+ else:
+ raise TypeError('The img type should be np.float32 or np.uint8, '
+ f'but got {img_type}')
+ return img
+
+
+def _convert_output_type_range(
+ img: np.ndarray, dst_type: Union[np.uint8, np.float32]) -> np.ndarray:
+ """Convert the type and range of the image according to dst_type.
+
+ It converts the image to desired type and range. If `dst_type` is np.uint8,
+ images will be converted to np.uint8 type with range [0, 255]. If
+ `dst_type` is np.float32, it converts the image to np.float32 type with
+ range [0, 1].
+ It is mainly used for post-processing images in colorspace conversion
+ functions such as rgb2ycbcr and ycbcr2rgb.
+
+ Args:
+ img (ndarray): The image to be converted with np.float32 type and
+ range [0, 255].
+ dst_type (np.uint8 | np.float32): If dst_type is np.uint8, it
+ converts the image to np.uint8 type with range [0, 255]. If
+ dst_type is np.float32, it converts the image to np.float32 type
+ with range [0, 1].
+
+ Returns:
+ (ndarray): The converted image with desired type and range.
+ """
+ if dst_type not in (np.uint8, np.float32):
+ raise TypeError('The dst_type should be np.float32 or np.uint8, '
+ f'but got {dst_type}')
+ if dst_type == np.uint8:
+ img = img.round()
+ else:
+ img /= 255.
+ return img.astype(dst_type)
+
+
+def rgb2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray:
+    """Convert an RGB image to a YCbCr image.
+
+ This function produces the same results as Matlab's `rgb2ycbcr` function.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `RGB <-> YCrCb`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+ y_only (bool): Whether to only return Y channel. Default: False.
+
+ Returns:
+ ndarray: The converted YCbCr image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img)
+ if y_only:
+ out_img = np.dot(img, [65.481, 128.553, 24.966]) + 16.0
+ else:
+ out_img = np.matmul(
+ img, [[65.481, -37.797, 112.0], [128.553, -74.203, -93.786],
+ [24.966, 112.0, -18.214]]) + [16, 128, 128]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def bgr2ycbcr(img: np.ndarray, y_only: bool = False) -> np.ndarray:
+ """Convert a BGR image to YCbCr image.
+
+ The bgr version of rgb2ycbcr.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `BGR <-> YCrCb`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+ y_only (bool): Whether to only return Y channel. Default: False.
+
+ Returns:
+ ndarray: The converted YCbCr image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img)
+ if y_only:
+ out_img = np.dot(img, [24.966, 128.553, 65.481]) + 16.0
+ else:
+ out_img = np.matmul(
+ img, [[24.966, 112.0, -18.214], [128.553, -74.203, -93.786],
+ [65.481, -37.797, 112.0]]) + [16, 128, 128]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def ycbcr2rgb(img: np.ndarray) -> np.ndarray:
+ """Convert a YCbCr image to RGB image.
+
+ This function produces the same results as Matlab's ycbcr2rgb function.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `YCrCb <-> RGB`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+
+ Returns:
+ ndarray: The converted RGB image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img) * 255
+ out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621],
+ [0, -0.00153632, 0.00791071],
+ [0.00625893, -0.00318811, 0]]) * 255.0 + [
+ -222.921, 135.576, -276.836
+ ]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def ycbcr2bgr(img: np.ndarray) -> np.ndarray:
+ """Convert a YCbCr image to BGR image.
+
+ The bgr version of ycbcr2rgb.
+ It implements the ITU-R BT.601 conversion for standard-definition
+ television. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#ITU-R_BT.601_conversion.
+
+ It differs from a similar function in cv2.cvtColor: `YCrCb <-> BGR`.
+ In OpenCV, it implements a JPEG conversion. See more details in
+ https://en.wikipedia.org/wiki/YCbCr#JPEG_conversion.
+
+ Args:
+ img (ndarray): The input image. It accepts:
+ 1. np.uint8 type with range [0, 255];
+ 2. np.float32 type with range [0, 1].
+
+ Returns:
+ ndarray: The converted BGR image. The output image has the same type
+ and range as input image.
+ """
+ img_type = img.dtype
+ img = _convert_input_type_range(img) * 255
+ out_img = np.matmul(img, [[0.00456621, 0.00456621, 0.00456621],
+ [0.00791071, -0.00153632, 0],
+ [0, -0.00318811, 0.00625893]]) * 255.0 + [
+ -276.836, 135.576, -222.921
+ ]
+ out_img = _convert_output_type_range(out_img, img_type)
+ return out_img
+
+
+def convert_color_factory(src: str, dst: str) -> Callable:
+
+ code = getattr(cv2, f'COLOR_{src.upper()}2{dst.upper()}')
+
+ def convert_color(img: np.ndarray) -> np.ndarray:
+ out_img = cv2.cvtColor(img, code)
+ return out_img
+
+ convert_color.__doc__ = f"""Convert a {src.upper()} image to {dst.upper()}
+ image.
+
+ Args:
+ img (ndarray or str): The input image.
+
+ Returns:
+ ndarray: The converted {dst.upper()} image.
+ """
+
+ return convert_color
+
+
+bgr2rgb = convert_color_factory('bgr', 'rgb')
+
+rgb2bgr = convert_color_factory('rgb', 'bgr')
+
+bgr2hsv = convert_color_factory('bgr', 'hsv')
+
+hsv2bgr = convert_color_factory('hsv', 'bgr')
+
+bgr2hls = convert_color_factory('bgr', 'hls')
+
+hls2bgr = convert_color_factory('hls', 'bgr')
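+
+
+# Usage sketch (illustrative): each converter generated above is a thin
+# wrapper around ``cv2.cvtColor`` with a fixed conversion code, e.g.
+#   >>> rgb = bgr2rgb(np.zeros((4, 4, 3), dtype=np.uint8))
+#   >>> rgb.shape
+#   (4, 4, 3)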
diff --git a/mmcv/mmcv/image/geometric.py b/mmcv/mmcv/image/geometric.py
new file mode 100644
index 0000000000000000000000000000000000000000..eecd795ea08127055cd8e90eb11c5e51fe586c18
--- /dev/null
+++ b/mmcv/mmcv/image/geometric.py
@@ -0,0 +1,741 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numbers
+
+import cv2
+import numpy as np
+
+from ..utils import to_2tuple
+from .io import imread_backend
+
+try:
+ from PIL import Image
+except ImportError:
+ Image = None
+
+
+def _scale_size(size, scale):
+ """Rescale a size by a ratio.
+
+ Args:
+ size (tuple[int]): (w, h).
+ scale (float | tuple(float)): Scaling factor.
+
+ Returns:
+ tuple[int]: scaled size.
+ """
+ if isinstance(scale, (float, int)):
+ scale = (scale, scale)
+ w, h = size
+ return int(w * float(scale[0]) + 0.5), int(h * float(scale[1]) + 0.5)
+
+
+cv2_interp_codes = {
+ 'nearest': cv2.INTER_NEAREST,
+ 'bilinear': cv2.INTER_LINEAR,
+ 'bicubic': cv2.INTER_CUBIC,
+ 'area': cv2.INTER_AREA,
+ 'lanczos': cv2.INTER_LANCZOS4
+}
+
+# Pillow >= v9.1.0 uses a slightly different naming scheme for filters.
+# Set pillow_interp_codes according to the naming scheme used.
+if Image is not None:
+ if hasattr(Image, 'Resampling'):
+ pillow_interp_codes = {
+ 'nearest': Image.Resampling.NEAREST,
+ 'bilinear': Image.Resampling.BILINEAR,
+ 'bicubic': Image.Resampling.BICUBIC,
+ 'box': Image.Resampling.BOX,
+ 'lanczos': Image.Resampling.LANCZOS,
+ 'hamming': Image.Resampling.HAMMING
+ }
+ else:
+ pillow_interp_codes = {
+ 'nearest': Image.NEAREST,
+ 'bilinear': Image.BILINEAR,
+ 'bicubic': Image.BICUBIC,
+ 'box': Image.BOX,
+ 'lanczos': Image.LANCZOS,
+ 'hamming': Image.HAMMING
+ }
+
+
+def imresize(img,
+ size,
+ return_scale=False,
+ interpolation='bilinear',
+ out=None,
+ backend=None):
+ """Resize image to a given size.
+
+ Args:
+ img (ndarray): The input image.
+ size (tuple[int]): Target size (w, h).
+ return_scale (bool): Whether to return `w_scale` and `h_scale`.
+ interpolation (str): Interpolation method, accepted values are
+ "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+ backend, "nearest", "bilinear" for 'pillow' backend.
+ out (ndarray): The output destination.
+ backend (str | None): The image resize backend type. Options are `cv2`,
+ `pillow`, `None`. If backend is None, the global imread_backend
+ specified by ``mmcv.use_backend()`` will be used. Default: None.
+
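+    Examples:
+        >>> # illustrative sketch with a hypothetical all-zero image
+        >>> img = np.zeros((100, 200, 3), dtype=np.uint8)
+        >>> imresize(img, (50, 40)).shape
+        (40, 50, 3)
+        >>> imresize(img, (50, 40), return_scale=True)[1:]
+        (0.25, 0.4)
+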
+ Returns:
+ tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
+ `resized_img`.
+ """
+ h, w = img.shape[:2]
+ if backend is None:
+ backend = imread_backend
+ if backend not in ['cv2', 'pillow']:
+        raise ValueError(f'backend: {backend} is not supported for resize. '
+ f"Supported backends are 'cv2', 'pillow'")
+
+ if backend == 'pillow':
+        assert img.dtype == np.uint8, 'Pillow backend only supports uint8 type'
+ pil_image = Image.fromarray(img)
+ pil_image = pil_image.resize(size, pillow_interp_codes[interpolation])
+ resized_img = np.array(pil_image)
+ else:
+ resized_img = cv2.resize(
+ img, size, dst=out, interpolation=cv2_interp_codes[interpolation])
+ if not return_scale:
+ return resized_img
+ else:
+ w_scale = size[0] / w
+ h_scale = size[1] / h
+ return resized_img, w_scale, h_scale
+
+
+def imresize_to_multiple(img,
+ divisor,
+ size=None,
+ scale_factor=None,
+ keep_ratio=False,
+ return_scale=False,
+ interpolation='bilinear',
+ out=None,
+ backend=None):
+    """Resize an image according to a given size or scale factor and then round
+    up the resized or rescaled image size to the nearest value that can be
+    divided by the divisor.
+
+ Args:
+ img (ndarray): The input image.
+ divisor (int | tuple): Resized image size will be a multiple of
+ divisor. If divisor is a tuple, divisor should be
+ (w_divisor, h_divisor).
+ size (None | int | tuple[int]): Target size (w, h). Default: None.
+ scale_factor (None | float | tuple[float]): Multiplier for spatial
+ size. Should match input size if it is a tuple and the 2D style is
+ (w_scale_factor, h_scale_factor). Default: None.
+ keep_ratio (bool): Whether to keep the aspect ratio when resizing the
+ image. Default: False.
+ return_scale (bool): Whether to return `w_scale` and `h_scale`.
+ interpolation (str): Interpolation method, accepted values are
+ "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+ backend, "nearest", "bilinear" for 'pillow' backend.
+ out (ndarray): The output destination.
+ backend (str | None): The image resize backend type. Options are `cv2`,
+ `pillow`, `None`. If backend is None, the global imread_backend
+ specified by ``mmcv.use_backend()`` will be used. Default: None.
+
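+    Examples:
+        >>> # illustrative sketch with a hypothetical all-zero image
+        >>> img = np.zeros((200, 300, 3), dtype=np.uint8)
+        >>> imresize_to_multiple(img, 32, size=(256, 256)).shape
+        (256, 256, 3)
+        >>> imresize_to_multiple(img, 100, scale_factor=0.5).shape
+        (100, 200, 3)
+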
+ Returns:
+ tuple | ndarray: (`resized_img`, `w_scale`, `h_scale`) or
+ `resized_img`.
+ """
+ h, w = img.shape[:2]
+ if size is not None and scale_factor is not None:
+ raise ValueError('only one of size or scale_factor should be defined')
+ elif size is None and scale_factor is None:
+ raise ValueError('one of size or scale_factor should be defined')
+ elif size is not None:
+ size = to_2tuple(size)
+ if keep_ratio:
+ size = rescale_size((w, h), size, return_scale=False)
+ else:
+ size = _scale_size((w, h), scale_factor)
+
+ divisor = to_2tuple(divisor)
+ size = tuple(int(np.ceil(s / d)) * d for s, d in zip(size, divisor))
+ resized_img, w_scale, h_scale = imresize(
+ img,
+ size,
+ return_scale=True,
+ interpolation=interpolation,
+ out=out,
+ backend=backend)
+ if return_scale:
+ return resized_img, w_scale, h_scale
+ else:
+ return resized_img
+
+
+def imresize_like(img,
+ dst_img,
+ return_scale=False,
+ interpolation='bilinear',
+ backend=None):
+ """Resize image to the same size of a given image.
+
+ Args:
+ img (ndarray): The input image.
+ dst_img (ndarray): The target image.
+ return_scale (bool): Whether to return `w_scale` and `h_scale`.
+ interpolation (str): Same as :func:`resize`.
+ backend (str | None): Same as :func:`resize`.
+
+ Returns:
+ tuple or ndarray: (`resized_img`, `w_scale`, `h_scale`) or
+ `resized_img`.
+ """
+ h, w = dst_img.shape[:2]
+ return imresize(img, (w, h), return_scale, interpolation, backend=backend)
+
+
+def rescale_size(old_size, scale, return_scale=False):
+ """Calculate the new size to be rescaled to.
+
+ Args:
+ old_size (tuple[int]): The old size (w, h) of image.
+ scale (float | tuple[int]): The scaling factor or maximum size.
+ If it is a float number, then the image will be rescaled by this
+ factor, else if it is a tuple of 2 integers, then the image will
+ be rescaled as large as possible within the scale.
+ return_scale (bool): Whether to return the scaling factor besides the
+ rescaled image size.
+
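+    Examples:
+        >>> # illustrative sketch: a float rescales both edges, a tuple caps
+        >>> # the long/short edges while keeping the aspect ratio
+        >>> rescale_size((640, 480), 0.5)
+        (320, 240)
+        >>> rescale_size((640, 480), (1000, 600))
+        (800, 600)
+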
+ Returns:
+ tuple[int]: The new rescaled image size.
+ """
+ w, h = old_size
+ if isinstance(scale, (float, int)):
+ if scale <= 0:
+ raise ValueError(f'Invalid scale {scale}, must be positive.')
+ scale_factor = scale
+ elif isinstance(scale, tuple):
+ max_long_edge = max(scale)
+ max_short_edge = min(scale)
+ scale_factor = min(max_long_edge / max(h, w),
+ max_short_edge / min(h, w))
+ else:
+ raise TypeError(
+ f'Scale must be a number or tuple of int, but got {type(scale)}')
+
+ new_size = _scale_size((w, h), scale_factor)
+
+ if return_scale:
+ return new_size, scale_factor
+ else:
+ return new_size
+
+
+def imrescale(img,
+ scale,
+ return_scale=False,
+ interpolation='bilinear',
+ backend=None):
+ """Resize image while keeping the aspect ratio.
+
+ Args:
+ img (ndarray): The input image.
+ scale (float | tuple[int]): The scaling factor or maximum size.
+ If it is a float number, then the image will be rescaled by this
+ factor, else if it is a tuple of 2 integers, then the image will
+ be rescaled as large as possible within the scale.
+ return_scale (bool): Whether to return the scaling factor besides the
+ rescaled image.
+ interpolation (str): Same as :func:`resize`.
+ backend (str | None): Same as :func:`resize`.
+
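+    Examples:
+        >>> # illustrative sketch with a hypothetical all-zero image
+        >>> img = np.zeros((100, 200, 3), dtype=np.uint8)
+        >>> imrescale(img, 0.5).shape
+        (50, 100, 3)
+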
+ Returns:
+ ndarray: The rescaled image.
+ """
+ h, w = img.shape[:2]
+ new_size, scale_factor = rescale_size((w, h), scale, return_scale=True)
+ rescaled_img = imresize(
+ img, new_size, interpolation=interpolation, backend=backend)
+ if return_scale:
+ return rescaled_img, scale_factor
+ else:
+ return rescaled_img
+
+
+def imflip(img, direction='horizontal'):
+ """Flip an image horizontally or vertically.
+
+ Args:
+ img (ndarray): Image to be flipped.
+ direction (str): The flip direction, either "horizontal" or
+ "vertical" or "diagonal".
+
+ Returns:
+ ndarray: The flipped image.
+ """
+ assert direction in ['horizontal', 'vertical', 'diagonal']
+ if direction == 'horizontal':
+ return np.flip(img, axis=1)
+ elif direction == 'vertical':
+ return np.flip(img, axis=0)
+ else:
+ return np.flip(img, axis=(0, 1))
+
+
+def imflip_(img, direction='horizontal'):
+ """Inplace flip an image horizontally or vertically.
+
+ Args:
+ img (ndarray): Image to be flipped.
+ direction (str): The flip direction, either "horizontal" or
+ "vertical" or "diagonal".
+
+ Returns:
+ ndarray: The flipped image (inplace).
+ """
+ assert direction in ['horizontal', 'vertical', 'diagonal']
+ if direction == 'horizontal':
+ return cv2.flip(img, 1, img)
+ elif direction == 'vertical':
+ return cv2.flip(img, 0, img)
+ else:
+ return cv2.flip(img, -1, img)
+
+
+def imrotate(img,
+ angle,
+ center=None,
+ scale=1.0,
+ border_value=0,
+ interpolation='bilinear',
+ auto_bound=False):
+ """Rotate an image.
+
+ Args:
+ img (ndarray): Image to be rotated.
+ angle (float): Rotation angle in degrees, positive values mean
+ clockwise rotation.
+ center (tuple[float], optional): Center point (w, h) of the rotation in
+ the source image. If not specified, the center of the image will be
+ used.
+ scale (float): Isotropic scale factor.
+ border_value (int): Border value.
+ interpolation (str): Same as :func:`resize`.
+ auto_bound (bool): Whether to adjust the image size to cover the whole
+ rotated image.
+
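+    Examples:
+        >>> # illustrative sketch: with auto_bound the canvas grows to fit
+        >>> img = np.zeros((100, 200, 3), dtype=np.uint8)
+        >>> imrotate(img, 90).shape
+        (100, 200, 3)
+        >>> imrotate(img, 90, auto_bound=True).shape
+        (200, 100, 3)
+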
+ Returns:
+ ndarray: The rotated image.
+ """
+ if center is not None and auto_bound:
+ raise ValueError('`auto_bound` conflicts with `center`')
+ h, w = img.shape[:2]
+ if center is None:
+ center = ((w - 1) * 0.5, (h - 1) * 0.5)
+ assert isinstance(center, tuple)
+
+ matrix = cv2.getRotationMatrix2D(center, -angle, scale)
+ if auto_bound:
+ cos = np.abs(matrix[0, 0])
+ sin = np.abs(matrix[0, 1])
+ new_w = h * sin + w * cos
+ new_h = h * cos + w * sin
+ matrix[0, 2] += (new_w - w) * 0.5
+ matrix[1, 2] += (new_h - h) * 0.5
+ w = int(np.round(new_w))
+ h = int(np.round(new_h))
+ rotated = cv2.warpAffine(
+ img,
+ matrix, (w, h),
+ flags=cv2_interp_codes[interpolation],
+ borderValue=border_value)
+ return rotated
+
+
+def bbox_clip(bboxes, img_shape):
+ """Clip bboxes to fit the image shape.
+
+ Args:
+ bboxes (ndarray): Shape (..., 4*k)
+ img_shape (tuple[int]): (height, width) of the image.
+
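+    Examples:
+        >>> # illustrative sketch: clip a box that exceeds a 240x320 image
+        >>> bbox_clip(np.array([-5., 10., 400., 300.]), (240, 320)).tolist()
+        [0.0, 10.0, 319.0, 239.0]
+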
+ Returns:
+ ndarray: Clipped bboxes.
+ """
+ assert bboxes.shape[-1] % 4 == 0
+ cmin = np.empty(bboxes.shape[-1], dtype=bboxes.dtype)
+ cmin[0::2] = img_shape[1] - 1
+ cmin[1::2] = img_shape[0] - 1
+ clipped_bboxes = np.maximum(np.minimum(bboxes, cmin), 0)
+ return clipped_bboxes
+
+
+def bbox_scaling(bboxes, scale, clip_shape=None):
+ """Scaling bboxes w.r.t the box center.
+
+ Args:
+ bboxes (ndarray): Shape(..., 4).
+ scale (float): Scaling factor.
+ clip_shape (tuple[int], optional): If specified, bboxes that exceed the
+ boundary will be clipped according to the given shape (h, w).
+
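+    Examples:
+        >>> # illustrative sketch: scale a single box by 2x around its center
+        >>> bbox_scaling(np.array([10., 10., 19., 19.]), 2.0).tolist()
+        [5.0, 5.0, 24.0, 24.0]
+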
+ Returns:
+ ndarray: Scaled bboxes.
+ """
+ if float(scale) == 1.0:
+ scaled_bboxes = bboxes.copy()
+ else:
+ w = bboxes[..., 2] - bboxes[..., 0] + 1
+ h = bboxes[..., 3] - bboxes[..., 1] + 1
+ dw = (w * (scale - 1)) * 0.5
+ dh = (h * (scale - 1)) * 0.5
+ scaled_bboxes = bboxes + np.stack((-dw, -dh, dw, dh), axis=-1)
+ if clip_shape is not None:
+ return bbox_clip(scaled_bboxes, clip_shape)
+ else:
+ return scaled_bboxes
+
+
+def imcrop(img, bboxes, scale=1.0, pad_fill=None):
+ """Crop image patches.
+
+ 3 steps: scale the bboxes -> clip bboxes -> crop and pad.
+
+ Args:
+ img (ndarray): Image to be cropped.
+ bboxes (ndarray): Shape (k, 4) or (4, ), location of cropped bboxes.
+ scale (float, optional): Scale ratio of bboxes, the default value
+            1.0 means no scaling.
+ pad_fill (Number | list[Number]): Value to be filled for padding.
+ Default: None, which means no padding.
+
+ Returns:
+ list[ndarray] | ndarray: The cropped image patches.
+ """
+ chn = 1 if img.ndim == 2 else img.shape[2]
+ if pad_fill is not None:
+ if isinstance(pad_fill, (int, float)):
+ pad_fill = [pad_fill for _ in range(chn)]
+ assert len(pad_fill) == chn
+
+ _bboxes = bboxes[None, ...] if bboxes.ndim == 1 else bboxes
+ scaled_bboxes = bbox_scaling(_bboxes, scale).astype(np.int32)
+ clipped_bbox = bbox_clip(scaled_bboxes, img.shape)
+
+ patches = []
+ for i in range(clipped_bbox.shape[0]):
+ x1, y1, x2, y2 = tuple(clipped_bbox[i, :])
+ if pad_fill is None:
+ patch = img[y1:y2 + 1, x1:x2 + 1, ...]
+ else:
+ _x1, _y1, _x2, _y2 = tuple(scaled_bboxes[i, :])
+ if chn == 1:
+ patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1)
+ else:
+ patch_shape = (_y2 - _y1 + 1, _x2 - _x1 + 1, chn)
+ patch = np.array(
+ pad_fill, dtype=img.dtype) * np.ones(
+ patch_shape, dtype=img.dtype)
+ x_start = 0 if _x1 >= 0 else -_x1
+ y_start = 0 if _y1 >= 0 else -_y1
+ w = x2 - x1 + 1
+ h = y2 - y1 + 1
+ patch[y_start:y_start + h, x_start:x_start + w,
+ ...] = img[y1:y1 + h, x1:x1 + w, ...]
+ patches.append(patch)
+
+ if bboxes.ndim == 1:
+ return patches[0]
+ else:
+ return patches
+
+
+def impad(img,
+ *,
+ shape=None,
+ padding=None,
+ pad_val=0,
+ padding_mode='constant'):
+ """Pad the given image to a certain shape or pad on all sides with
+ specified padding mode and padding value.
+
+ Args:
+ img (ndarray): Image to be padded.
+ shape (tuple[int]): Expected padding shape (h, w). Default: None.
+ padding (int or tuple[int]): Padding on each border. If a single int is
+ provided this is used to pad all borders. If tuple of length 2 is
+ provided this is the padding on left/right and top/bottom
+ respectively. If a tuple of length 4 is provided this is the
+ padding for the left, top, right and bottom borders respectively.
+ Default: None. Note that `shape` and `padding` can not be both
+ set.
+ pad_val (Number | Sequence[Number]): Values to be filled in padding
+ areas when padding_mode is 'constant'. Default: 0.
+ padding_mode (str): Type of padding. Should be: constant, edge,
+ reflect or symmetric. Default: constant.
+ - constant: pads with a constant value, this value is specified
+ with pad_val.
+ - edge: pads with the last value at the edge of the image.
+ - reflect: pads with reflection of image without repeating the last
+ value on the edge. For example, padding [1, 2, 3, 4] with 2
+ elements on both sides in reflect mode will result in
+ [3, 2, 1, 2, 3, 4, 3, 2].
+ - symmetric: pads with reflection of image repeating the last value
+ on the edge. For example, padding [1, 2, 3, 4] with 2 elements on
+ both sides in symmetric mode will result in
+ [2, 1, 1, 2, 3, 4, 4, 3]
+
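+    Examples:
+        >>> # illustrative sketch with a hypothetical all-zero image
+        >>> img = np.zeros((10, 20, 3), dtype=np.uint8)
+        >>> impad(img, shape=(16, 32)).shape
+        (16, 32, 3)
+        >>> impad(img, padding=(1, 2, 3, 4)).shape
+        (16, 24, 3)
+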
+ Returns:
+ ndarray: The padded image.
+ """
+
+ assert (shape is not None) ^ (padding is not None)
+ if shape is not None:
+ width = max(shape[1] - img.shape[1], 0)
+ height = max(shape[0] - img.shape[0], 0)
+ padding = (0, 0, width, height)
+
+ # check pad_val
+ if isinstance(pad_val, tuple):
+ assert len(pad_val) == img.shape[-1]
+ elif not isinstance(pad_val, numbers.Number):
+        raise TypeError('pad_val must be an int or a tuple. '
+ f'But received {type(pad_val)}')
+
+ # check padding
+ if isinstance(padding, tuple) and len(padding) in [2, 4]:
+ if len(padding) == 2:
+ padding = (padding[0], padding[1], padding[0], padding[1])
+ elif isinstance(padding, numbers.Number):
+ padding = (padding, padding, padding, padding)
+ else:
+        raise ValueError('Padding must be an int or a 2- or 4-element tuple. '
+                         f'But received {padding}')
+
+ # check padding mode
+ assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric']
+
+ border_type = {
+ 'constant': cv2.BORDER_CONSTANT,
+ 'edge': cv2.BORDER_REPLICATE,
+ 'reflect': cv2.BORDER_REFLECT_101,
+ 'symmetric': cv2.BORDER_REFLECT
+ }
+ img = cv2.copyMakeBorder(
+ img,
+ padding[1],
+ padding[3],
+ padding[0],
+ padding[2],
+ border_type[padding_mode],
+ value=pad_val)
+
+ return img
+
+
+def impad_to_multiple(img, divisor, pad_val=0):
+    """Pad an image to ensure that each edge is a multiple of some number.
+
+ Args:
+ img (ndarray): Image to be padded.
+        divisor (int): Padded image edges will be multiples of the divisor.
+ pad_val (Number | Sequence[Number]): Same as :func:`impad`.
+
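+    Examples:
+        >>> # illustrative sketch: both edges are rounded up to a multiple of 32
+        >>> img = np.zeros((100, 150, 3), dtype=np.uint8)
+        >>> impad_to_multiple(img, 32).shape
+        (128, 160, 3)
+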
+ Returns:
+ ndarray: The padded image.
+ """
+ pad_h = int(np.ceil(img.shape[0] / divisor)) * divisor
+ pad_w = int(np.ceil(img.shape[1] / divisor)) * divisor
+ return impad(img, shape=(pad_h, pad_w), pad_val=pad_val)
+
+
+def cutout(img, shape, pad_val=0):
+ """Randomly cut out a rectangle from the original img.
+
+ Args:
+ img (ndarray): Image to be cutout.
+        shape (int | tuple[int]): Expected cutout shape (h, w). If given as an
+            int, the value will be used for both h and w.
+ pad_val (int | float | tuple[int | float]): Values to be filled in the
+ cut area. Defaults to 0.
+
+ Returns:
+ ndarray: The cutout image.
+ """
+
+ channels = 1 if img.ndim == 2 else img.shape[2]
+ if isinstance(shape, int):
+ cut_h, cut_w = shape, shape
+ else:
+ assert isinstance(shape, tuple) and len(shape) == 2, \
+            f'shape must be an int or a tuple with length 2, but got type ' \
+ f'{type(shape)} instead.'
+ cut_h, cut_w = shape
+ if isinstance(pad_val, (int, float)):
+ pad_val = tuple([pad_val] * channels)
+ elif isinstance(pad_val, tuple):
+ assert len(pad_val) == channels, \
+            'Expected the number of elements in the tuple to equal the ' \
+            'channels of the input image. Found {} vs {}'.format(
+ len(pad_val), channels)
+ else:
+ raise TypeError(f'Invalid type {type(pad_val)} for `pad_val`')
+
+ img_h, img_w = img.shape[:2]
+ y0 = np.random.uniform(img_h)
+ x0 = np.random.uniform(img_w)
+
+ y1 = int(max(0, y0 - cut_h / 2.))
+ x1 = int(max(0, x0 - cut_w / 2.))
+ y2 = min(img_h, y1 + cut_h)
+ x2 = min(img_w, x1 + cut_w)
+
+ if img.ndim == 2:
+ patch_shape = (y2 - y1, x2 - x1)
+ else:
+ patch_shape = (y2 - y1, x2 - x1, channels)
+
+ img_cutout = img.copy()
+ patch = np.array(
+ pad_val, dtype=img.dtype) * np.ones(
+ patch_shape, dtype=img.dtype)
+ img_cutout[y1:y2, x1:x2, ...] = patch
+
+ return img_cutout
+
+
+def _get_shear_matrix(magnitude, direction='horizontal'):
+ """Generate the shear matrix for transformation.
+
+ Args:
+ magnitude (int | float): The magnitude used for shear.
+        direction (str): The shear direction, either "horizontal"
+ or "vertical".
+
+ Returns:
+ ndarray: The shear matrix with dtype float32.
+ """
+ if direction == 'horizontal':
+ shear_matrix = np.float32([[1, magnitude, 0], [0, 1, 0]])
+ elif direction == 'vertical':
+ shear_matrix = np.float32([[1, 0, 0], [magnitude, 1, 0]])
+ return shear_matrix
+
+
+def imshear(img,
+ magnitude,
+ direction='horizontal',
+ border_value=0,
+ interpolation='bilinear'):
+ """Shear an image.
+
+ Args:
+ img (ndarray): Image to be sheared with format (h, w)
+ or (h, w, c).
+ magnitude (int | float): The magnitude used for shear.
+        direction (str): The shear direction, either "horizontal"
+ or "vertical".
+ border_value (int | tuple[int]): Value used in case of a
+ constant border.
+ interpolation (str): Same as :func:`resize`.
+
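+    Examples:
+        >>> # illustrative sketch: horizontal shear keeps the output size
+        >>> img = np.zeros((50, 80, 3), dtype=np.uint8)
+        >>> imshear(img, 0.5).shape
+        (50, 80, 3)
+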
+ Returns:
+ ndarray: The sheared image.
+ """
+ assert direction in ['horizontal',
+ 'vertical'], f'Invalid direction: {direction}'
+ height, width = img.shape[:2]
+ if img.ndim == 2:
+ channels = 1
+ elif img.ndim == 3:
+ channels = img.shape[-1]
+ if isinstance(border_value, int):
+ border_value = tuple([border_value] * channels)
+ elif isinstance(border_value, tuple):
+ assert len(border_value) == channels, \
+            'Expected the number of elements in the tuple to equal the ' \
+            'channels of the input image. Found {} vs {}'.format(
+ len(border_value), channels)
+ else:
+ raise ValueError(
+ f'Invalid type {type(border_value)} for `border_value`')
+ shear_matrix = _get_shear_matrix(magnitude, direction)
+ sheared = cv2.warpAffine(
+ img,
+ shear_matrix,
+ (width, height),
+        # Note: when the number of elements in `border_value` is greater
+        # than 3 (e.g. shearing masks whose number of channels is larger
+        # than 3), `cv2.warpAffine` will raise a TypeError. Here we simply
+        # slice the first 3 values in `border_value`.
+ borderValue=border_value[:3],
+ flags=cv2_interp_codes[interpolation])
+ return sheared
+
+
+def _get_translate_matrix(offset, direction='horizontal'):
+ """Generate the translate matrix.
+
+ Args:
+ offset (int | float): The offset used for translate.
+ direction (str): The translate direction, either
+ "horizontal" or "vertical".
+
+ Returns:
+ ndarray: The translate matrix with dtype float32.
+ """
+ if direction == 'horizontal':
+ translate_matrix = np.float32([[1, 0, offset], [0, 1, 0]])
+ elif direction == 'vertical':
+ translate_matrix = np.float32([[1, 0, 0], [0, 1, offset]])
+ return translate_matrix
+
+
+def imtranslate(img,
+ offset,
+ direction='horizontal',
+ border_value=0,
+ interpolation='bilinear'):
+ """Translate an image.
+
+ Args:
+ img (ndarray): Image to be translated with format
+ (h, w) or (h, w, c).
+ offset (int | float): The offset used for translate.
+ direction (str): The translate direction, either "horizontal"
+ or "vertical".
+ border_value (int | tuple[int]): Value used in case of a
+ constant border.
+ interpolation (str): Same as :func:`resize`.
+
+ Returns:
+ ndarray: The translated image.
+ """
+ assert direction in ['horizontal',
+ 'vertical'], f'Invalid direction: {direction}'
+ height, width = img.shape[:2]
+ if img.ndim == 2:
+ channels = 1
+ elif img.ndim == 3:
+ channels = img.shape[-1]
+ if isinstance(border_value, int):
+ border_value = tuple([border_value] * channels)
+ elif isinstance(border_value, tuple):
+ assert len(border_value) == channels, \
+            'Expected the number of elements in the tuple to equal the ' \
+            'channels of the input image. Found {} vs {}'.format(
+ len(border_value), channels)
+ else:
+ raise ValueError(
+ f'Invalid type {type(border_value)} for `border_value`.')
+ translate_matrix = _get_translate_matrix(offset, direction)
+ translated = cv2.warpAffine(
+ img,
+ translate_matrix,
+ (width, height),
+        # Note: when the number of elements in `border_value` is greater
+        # than 3 (e.g. translating masks whose number of channels is
+        # larger than 3), `cv2.warpAffine` will raise a TypeError. Here we
+        # simply slice the first 3 values in `border_value`.
+ borderValue=border_value[:3],
+ flags=cv2_interp_codes[interpolation])
+ return translated
diff --git a/mmcv/mmcv/image/io.py b/mmcv/mmcv/image/io.py
new file mode 100644
index 0000000000000000000000000000000000000000..ae81b561a84cccfa4923364679dce56d762db1bc
--- /dev/null
+++ b/mmcv/mmcv/image/io.py
@@ -0,0 +1,314 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import io
+import os.path as osp
+import warnings
+from pathlib import Path
+
+import cv2
+import numpy as np
+from cv2 import (IMREAD_COLOR, IMREAD_GRAYSCALE, IMREAD_IGNORE_ORIENTATION,
+ IMREAD_UNCHANGED)
+
+from mmcv.fileio import FileClient
+from mmcv.utils import is_filepath, is_str
+
+try:
+ from turbojpeg import TJCS_RGB, TJPF_BGR, TJPF_GRAY, TurboJPEG
+except ImportError:
+ TJCS_RGB = TJPF_GRAY = TJPF_BGR = TurboJPEG = None
+
+try:
+ from PIL import Image, ImageOps
+except ImportError:
+ Image = None
+
+try:
+ import tifffile
+except ImportError:
+ tifffile = None
+
+jpeg = None
+supported_backends = ['cv2', 'turbojpeg', 'pillow', 'tifffile']
+
+imread_flags = {
+ 'color': IMREAD_COLOR,
+ 'grayscale': IMREAD_GRAYSCALE,
+ 'unchanged': IMREAD_UNCHANGED,
+ 'color_ignore_orientation': IMREAD_IGNORE_ORIENTATION | IMREAD_COLOR,
+ 'grayscale_ignore_orientation':
+ IMREAD_IGNORE_ORIENTATION | IMREAD_GRAYSCALE
+}
+
+imread_backend = 'cv2'
+
+
+def use_backend(backend):
+ """Select a backend for image decoding.
+
+ Args:
+ backend (str): The image decoding backend type. Options are `cv2`,
+ `pillow`, `turbojpeg` (see https://github.com/lilohuang/PyTurboJPEG)
+ and `tifffile`. `turbojpeg` is faster but it only supports `.jpeg`
+ file format.
+ """
+ assert backend in supported_backends
+ global imread_backend
+ imread_backend = backend
+ if imread_backend == 'turbojpeg':
+ if TurboJPEG is None:
+ raise ImportError('`PyTurboJPEG` is not installed')
+ global jpeg
+ if jpeg is None:
+ jpeg = TurboJPEG()
+ elif imread_backend == 'pillow':
+ if Image is None:
+ raise ImportError('`Pillow` is not installed')
+ elif imread_backend == 'tifffile':
+ if tifffile is None:
+ raise ImportError('`tifffile` is not installed')
+
+
+def _jpegflag(flag='color', channel_order='bgr'):
+ channel_order = channel_order.lower()
+ if channel_order not in ['rgb', 'bgr']:
+ raise ValueError('channel order must be either "rgb" or "bgr"')
+
+ if flag == 'color':
+ if channel_order == 'bgr':
+ return TJPF_BGR
+ elif channel_order == 'rgb':
+ return TJCS_RGB
+ elif flag == 'grayscale':
+ return TJPF_GRAY
+ else:
+ raise ValueError('flag must be "color" or "grayscale"')
+
+
+def _pillow2array(img, flag='color', channel_order='bgr'):
+ """Convert a pillow image to numpy array.
+
+ Args:
+ img (:obj:`PIL.Image.Image`): The image loaded using PIL
+ flag (str): Flags specifying the color type of a loaded image,
+ candidates are 'color', 'grayscale' and 'unchanged'.
+ Default to 'color'.
+ channel_order (str): The channel order of the output image array,
+ candidates are 'bgr' and 'rgb'. Default to 'bgr'.
+
+ Returns:
+ np.ndarray: The converted numpy array
+ """
+ channel_order = channel_order.lower()
+ if channel_order not in ['rgb', 'bgr']:
+ raise ValueError('channel order must be either "rgb" or "bgr"')
+
+ if flag == 'unchanged':
+ array = np.array(img)
+ if array.ndim >= 3 and array.shape[2] >= 3: # color image
+ array[:, :, :3] = array[:, :, (2, 1, 0)] # RGB to BGR
+ else:
+ # Handle exif orientation tag
+ if flag in ['color', 'grayscale']:
+ img = ImageOps.exif_transpose(img)
+ # If the image mode is not 'RGB', convert it to 'RGB' first.
+ if img.mode != 'RGB':
+ if img.mode != 'LA':
+ # Most formats except 'LA' can be directly converted to RGB
+ img = img.convert('RGB')
+ else:
+ # When the mode is 'LA', the default conversion will fill in
+ # the canvas with black, which sometimes shadows black objects
+ # in the foreground.
+ #
+ # Therefore, a random color (124, 117, 104) is used for canvas
+ img_rgba = img.convert('RGBA')
+ img = Image.new('RGB', img_rgba.size, (124, 117, 104))
+ img.paste(img_rgba, mask=img_rgba.split()[3]) # 3 is alpha
+ if flag in ['color', 'color_ignore_orientation']:
+ array = np.array(img)
+ if channel_order != 'rgb':
+ array = array[:, :, ::-1] # RGB to BGR
+ elif flag in ['grayscale', 'grayscale_ignore_orientation']:
+ img = img.convert('L')
+ array = np.array(img)
+ else:
+ raise ValueError(
+ 'flag must be "color", "grayscale", "unchanged", '
+ f'"color_ignore_orientation" or "grayscale_ignore_orientation"'
+ f' but got {flag}')
+ return array
+
+
+def imread(img_or_path,
+ flag='color',
+ channel_order='bgr',
+ backend=None,
+ file_client_args=None):
+ """Read an image.
+
+ Note:
+        In v1.4.1 and later, the `file_client_args` parameter was added.
+
+ Args:
+ img_or_path (ndarray or str or Path): Either a numpy array or str or
+ pathlib.Path. If it is a numpy array (loaded image), then
+ it will be returned as is.
+ flag (str): Flags specifying the color type of a loaded image,
+ candidates are `color`, `grayscale`, `unchanged`,
+ `color_ignore_orientation` and `grayscale_ignore_orientation`.
+ By default, `cv2` and `pillow` backend would rotate the image
+ according to its EXIF info unless called with `unchanged` or
+ `*_ignore_orientation` flags. `turbojpeg` and `tifffile` backend
+ always ignore image's EXIF info regardless of the flag.
+ The `turbojpeg` backend only supports `color` and `grayscale`.
+ channel_order (str): Order of channel, candidates are `bgr` and `rgb`.
+ backend (str | None): The image decoding backend type. Options are
+ `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`.
+ If backend is None, the global imread_backend specified by
+ ``mmcv.use_backend()`` will be used. Default: None.
+ file_client_args (dict | None): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Returns:
+ ndarray: Loaded image array.
+
+ Examples:
+ >>> import mmcv
+ >>> img_path = '/path/to/img.jpg'
+ >>> img = mmcv.imread(img_path)
+ >>> img = mmcv.imread(img_path, flag='color', channel_order='rgb',
+ ... backend='cv2')
+ >>> img = mmcv.imread(img_path, flag='color', channel_order='bgr',
+ ... backend='pillow')
+ >>> s3_img_path = 's3://bucket/img.jpg'
+ >>> # infer the file backend by the prefix s3
+ >>> img = mmcv.imread(s3_img_path)
+ >>> # manually set the file backend petrel
+ >>> img = mmcv.imread(s3_img_path, file_client_args={
+ ... 'backend': 'petrel'})
+ >>> http_img_path = 'http://path/to/img.jpg'
+ >>> img = mmcv.imread(http_img_path)
+ >>> img = mmcv.imread(http_img_path, file_client_args={
+ ... 'backend': 'http'})
+ """
+
+ if isinstance(img_or_path, Path):
+ img_or_path = str(img_or_path)
+
+ if isinstance(img_or_path, np.ndarray):
+ return img_or_path
+ elif is_str(img_or_path):
+ file_client = FileClient.infer_client(file_client_args, img_or_path)
+ img_bytes = file_client.get(img_or_path)
+ return imfrombytes(img_bytes, flag, channel_order, backend)
+ else:
+ raise TypeError('"img" must be a numpy array or a str or '
+ 'a pathlib.Path object')
+
+
+def imfrombytes(content, flag='color', channel_order='bgr', backend=None):
+ """Read an image from bytes.
+
+ Args:
+ content (bytes): Image bytes got from files or other streams.
+ flag (str): Same as :func:`imread`.
+ channel_order (str): The channel order of the output, candidates
+ are 'bgr' and 'rgb'. Default to 'bgr'.
+ backend (str | None): The image decoding backend type. Options are
+ `cv2`, `pillow`, `turbojpeg`, `tifffile`, `None`. If backend is
+ None, the global imread_backend specified by ``mmcv.use_backend()``
+ will be used. Default: None.
+
+ Returns:
+ ndarray: Loaded image array.
+
+ Examples:
+ >>> img_path = '/path/to/img.jpg'
+ >>> with open(img_path, 'rb') as f:
+ >>> img_buff = f.read()
+ >>> img = mmcv.imfrombytes(img_buff)
+ >>> img = mmcv.imfrombytes(img_buff, flag='color', channel_order='rgb')
+ >>> img = mmcv.imfrombytes(img_buff, backend='pillow')
+ >>> img = mmcv.imfrombytes(img_buff, backend='cv2')
+ """
+
+ if backend is None:
+ backend = imread_backend
+ if backend not in supported_backends:
+ raise ValueError(
+ f'backend: {backend} is not supported. Supported '
+ "backends are 'cv2', 'turbojpeg', 'pillow', 'tifffile'")
+ if backend == 'turbojpeg':
+ img = jpeg.decode(content, _jpegflag(flag, channel_order))
+ if img.shape[-1] == 1:
+ img = img[:, :, 0]
+ return img
+ elif backend == 'pillow':
+ with io.BytesIO(content) as buff:
+ img = Image.open(buff)
+ img = _pillow2array(img, flag, channel_order)
+ return img
+ elif backend == 'tifffile':
+ with io.BytesIO(content) as buff:
+ img = tifffile.imread(buff)
+ return img
+ else:
+ img_np = np.frombuffer(content, np.uint8)
+ flag = imread_flags[flag] if is_str(flag) else flag
+ img = cv2.imdecode(img_np, flag)
+ if flag == IMREAD_COLOR and channel_order == 'rgb':
+ cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
+ return img
+
+
+def imwrite(img,
+ file_path,
+ params=None,
+ auto_mkdir=None,
+ file_client_args=None):
+ """Write image to file.
+
+ Note:
+        In v1.4.1 and later, the `file_client_args` parameter was added.
+
+ Warning:
+        The parameter `auto_mkdir` will be deprecated in the future and
+        every file client will create directories automatically.
+
+ Args:
+ img (ndarray): Image array to be written.
+ file_path (str): Image file path.
+ params (None or list): Same as opencv :func:`imwrite` interface.
+ auto_mkdir (bool): If the parent folder of `file_path` does not exist,
+ whether to create it automatically. It will be deprecated.
+ file_client_args (dict | None): Arguments to instantiate a
+ FileClient. See :class:`mmcv.fileio.FileClient` for details.
+ Default: None.
+
+ Returns:
+ bool: Successful or not.
+
+ Examples:
+ >>> # write to hard disk client
+ >>> ret = mmcv.imwrite(img, '/path/to/img.jpg')
+ >>> # infer the file backend by the prefix s3
+ >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg')
+ >>> # manually set the file backend petrel
+ >>> ret = mmcv.imwrite(img, 's3://bucket/img.jpg', file_client_args={
+ ... 'backend': 'petrel'})
+ """
+ assert is_filepath(file_path)
+ file_path = str(file_path)
+ if auto_mkdir is not None:
+ warnings.warn(
+ 'The parameter `auto_mkdir` will be deprecated in the future and '
+            'every file client will create directories automatically.')
+ file_client = FileClient.infer_client(file_client_args, file_path)
+ img_ext = osp.splitext(file_path)[-1]
+ # Encode image according to image suffix.
+ # For example, if image path is '/path/your/img.jpg', the encode
+ # format is '.jpg'.
+ flag, img_buff = cv2.imencode(img_ext, img, params)
+ file_client.put(img_buff.tobytes(), file_path)
+ return flag
diff --git a/mmcv/mmcv/image/misc.py b/mmcv/mmcv/image/misc.py
new file mode 100644
index 0000000000000000000000000000000000000000..43934a689dd7ac6d35b772b7ce9921ff3b1fff50
--- /dev/null
+++ b/mmcv/mmcv/image/misc.py
@@ -0,0 +1,53 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import numpy as np
+
+import mmcv
+
+try:
+ import torch
+except ImportError:
+ torch = None
+
+
+def tensor2imgs(tensor, mean=None, std=None, to_rgb=True):
+ """Convert tensor to 3-channel images or 1-channel gray images.
+
+ Args:
+ tensor (torch.Tensor): Tensor that contains multiple images, shape (
+ N, C, H, W). :math:`C` can be either 3 or 1.
+        mean (tuple[float], optional): Mean of images. If None,
+            (0, 0, 0) will be used for 3-channel tensors and (0, ) for
+            1-channel tensors. Defaults to None.
+        std (tuple[float], optional): Standard deviation of images. If None,
+            (1, 1, 1) will be used for 3-channel tensors and (1, ) for
+            1-channel tensors. Defaults to None.
+        to_rgb (bool, optional): Whether the tensor was converted to RGB
+            format in the first place. If so, convert it back to BGR.
+            For a 1-channel tensor, it must be False. Defaults to True.
+
+ Returns:
+ list[np.ndarray]: A list that contains multiple images.
+ """
+
+ if torch is None:
+ raise RuntimeError('pytorch is not installed')
+ assert torch.is_tensor(tensor) and tensor.ndim == 4
+ channels = tensor.size(1)
+ assert channels in [1, 3]
+ if mean is None:
+ mean = (0, ) * channels
+ if std is None:
+ std = (1, ) * channels
+ assert (channels == len(mean) == len(std) == 3) or \
+ (channels == len(mean) == len(std) == 1 and not to_rgb)
+
+ num_imgs = tensor.size(0)
+ mean = np.array(mean, dtype=np.float32)
+ std = np.array(std, dtype=np.float32)
+ imgs = []
+ for img_id in range(num_imgs):
+ img = tensor[img_id, ...].cpu().numpy().transpose(1, 2, 0)
+ img = mmcv.imdenormalize(
+ img, mean, std, to_bgr=to_rgb).astype(np.uint8)
+ imgs.append(np.ascontiguousarray(img))
+ return imgs
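+
+# Usage sketch (editor's illustration, not part of upstream mmcv):
+#
+#   >>> import torch
+#   >>> batch = torch.rand(2, 3, 32, 32)         # NCHW, values in [0, 1]
+#   >>> imgs = tensor2imgs(batch, to_rgb=False)  # mean/std default to 0/1
+#   >>> len(imgs), imgs[0].shape
+#   (2, (32, 32, 3))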
diff --git a/mmcv/mmcv/image/photometric.py b/mmcv/mmcv/image/photometric.py
new file mode 100644
index 0000000000000000000000000000000000000000..b41cea7172ae0ece858d868b73dc65deaea3510c
--- /dev/null
+++ b/mmcv/mmcv/image/photometric.py
@@ -0,0 +1,471 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import cv2
+import numpy as np
+
+from ..utils import is_tuple_of
+from .colorspace import bgr2gray, gray2bgr
+
+
+def imnormalize(img, mean, std, to_rgb=True):
+ """Normalize an image with mean and std.
+
+ Args:
+ img (ndarray): Image to be normalized.
+        mean (ndarray): The mean to be used for normalization.
+        std (ndarray): The std to be used for normalization.
+        to_rgb (bool): Whether to convert the image from BGR to RGB.
+
+ Returns:
+ ndarray: The normalized image.
+ """
+ img = img.copy().astype(np.float32)
+ return imnormalize_(img, mean, std, to_rgb)
+
+
+def imnormalize_(img, mean, std, to_rgb=True):
+ """Inplace normalize an image with mean and std.
+
+ Args:
+ img (ndarray): Image to be normalized.
+        mean (ndarray): The mean to be used for normalization.
+        std (ndarray): The std to be used for normalization.
+        to_rgb (bool): Whether to convert the image from BGR to RGB.
+
+ Returns:
+ ndarray: The normalized image.
+ """
+ # cv2 inplace normalization does not accept uint8
+ assert img.dtype != np.uint8
+ mean = np.float64(mean.reshape(1, -1))
+ stdinv = 1 / np.float64(std.reshape(1, -1))
+ if to_rgb:
+ cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace
+ cv2.subtract(img, mean, img) # inplace
+ cv2.multiply(img, stdinv, img) # inplace
+ return img
+
+
+def imdenormalize(img, mean, std, to_bgr=True):
+ assert img.dtype != np.uint8
+ mean = mean.reshape(1, -1).astype(np.float64)
+ std = std.reshape(1, -1).astype(np.float64)
+ img = cv2.multiply(img, std) # make a copy
+ cv2.add(img, mean, img) # inplace
+ if to_bgr:
+ cv2.cvtColor(img, cv2.COLOR_RGB2BGR, img) # inplace
+ return img
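+
+# Usage sketch (editor's illustration, not part of upstream mmcv): normalize
+# with the commonly used ImageNet statistics, then invert the transform.
+#
+#   >>> img = np.random.randint(0, 256, (4, 4, 3), dtype=np.uint8)  # BGR
+#   >>> mean = np.array([123.675, 116.28, 103.53])
+#   >>> std = np.array([58.395, 57.12, 57.375])
+#   >>> norm = imnormalize(img, mean, std, to_rgb=True)
+#   >>> back = imdenormalize(norm, mean, std, to_bgr=True)
+#   >>> np.allclose(img, back, atol=1e-3)
+#   True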
+
+
+def iminvert(img):
+ """Invert (negate) an image.
+
+ Args:
+ img (ndarray): Image to be inverted.
+
+ Returns:
+ ndarray: The inverted image.
+ """
+ return np.full_like(img, 255) - img
+
+
+def solarize(img, thr=128):
+ """Solarize an image (invert all pixel values above a threshold)
+
+ Args:
+ img (ndarray): Image to be solarized.
+ thr (int): Threshold for solarizing (0 - 255).
+
+ Returns:
+ ndarray: The solarized image.
+ """
+ img = np.where(img < thr, img, 255 - img)
+ return img
+
+
+def posterize(img, bits):
+ """Posterize an image (reduce the number of bits for each color channel)
+
+ Args:
+ img (ndarray): Image to be posterized.
+ bits (int): Number of bits (1 to 8) to use for posterizing.
+
+ Returns:
+ ndarray: The posterized image.
+ """
+ shift = 8 - bits
+ img = np.left_shift(np.right_shift(img, shift), shift)
+ return img
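+
+# Usage sketch (editor's illustration, not part of upstream mmcv; `img` is
+# assumed to be a uint8 image array):
+#
+#   >>> solarized = solarize(img, thr=128)   # invert pixels >= 128
+#   >>> posterized = posterize(img, bits=4)  # keep the 4 most significant bits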
+
+
+def adjust_color(img, alpha=1, beta=None, gamma=0):
+ r"""It blends the source image and its gray image:
+
+ .. math::
+ output = img * alpha + gray\_img * beta + gamma
+
+ Args:
+ img (ndarray): The input source image.
+ alpha (int | float): Weight for the source image. Default 1.
+ beta (int | float): Weight for the converted gray image.
+ If None, it's assigned the value (1 - `alpha`).
+ gamma (int | float): Scalar added to each sum.
+ Same as :func:`cv2.addWeighted`. Default 0.
+
+ Returns:
+ ndarray: Colored image which has the same size and dtype as input.
+ """
+ gray_img = bgr2gray(img)
+ gray_img = np.tile(gray_img[..., None], [1, 1, 3])
+ if beta is None:
+ beta = 1 - alpha
+ colored_img = cv2.addWeighted(img, alpha, gray_img, beta, gamma)
+ if not colored_img.dtype == np.uint8:
+ # Note when the dtype of `img` is not the default `np.uint8`
+ # (e.g. np.float32), the value in `colored_img` got from cv2
+ # is not guaranteed to be in range [0, 255], so here clip
+ # is needed.
+ colored_img = np.clip(colored_img, 0, 255)
+ return colored_img
+
+
+def imequalize(img):
+ """Equalize the image histogram.
+
+ This function applies a non-linear mapping to the input image,
+ in order to create a uniform distribution of grayscale values
+ in the output image.
+
+ Args:
+ img (ndarray): Image to be equalized.
+
+ Returns:
+ ndarray: The equalized image.
+ """
+
+ def _scale_channel(im, c):
+ """Scale the data in the corresponding channel."""
+ im = im[:, :, c]
+ # Compute the histogram of the image channel.
+ histo = np.histogram(im, 256, (0, 255))[0]
+        # For computing the step, keep only the nonzero bins.
+ nonzero_histo = histo[histo > 0]
+ step = (np.sum(nonzero_histo) - nonzero_histo[-1]) // 255
+ if not step:
+ lut = np.array(range(256))
+ else:
+ # Compute the cumulative sum, shifted by step // 2
+ # and then normalized by step.
+ lut = (np.cumsum(histo) + (step // 2)) // step
+ # Shift lut, prepending with 0.
+ lut = np.concatenate([[0], lut[:-1]], 0)
+ # handle potential integer overflow
+ lut[lut > 255] = 255
+ # If step is zero, return the original image.
+ # Otherwise, index from lut.
+ return np.where(np.equal(step, 0), im, lut[im])
+
+ # Scales each channel independently and then stacks
+ # the result.
+ s1 = _scale_channel(img, 0)
+ s2 = _scale_channel(img, 1)
+ s3 = _scale_channel(img, 2)
+ equalized_img = np.stack([s1, s2, s3], axis=-1)
+ return equalized_img.astype(img.dtype)
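+
+# Usage sketch (editor's illustration, not part of upstream mmcv; `img` is
+# assumed to be a uint8 BGR image array):
+#
+#   >>> desaturated = adjust_color(img, alpha=0.5)  # blend halfway to gray
+#   >>> equalized = imequalize(img)                 # per-channel equalization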
+
+
+def adjust_brightness(img, factor=1.):
+ """Adjust image brightness.
+
+ This function controls the brightness of an image. An
+ enhancement factor of 0.0 gives a black image.
+ A factor of 1.0 gives the original image. This function
+ blends the source image and the degenerated black image:
+
+ .. math::
+ output = img * factor + degenerated * (1 - factor)
+
+ Args:
+ img (ndarray): Image to be brightened.
+        factor (float): A value that controls the enhancement.
+            A factor of 1.0 returns the original image; lower
+            values give a darker image and higher values a
+            brighter one. Default 1.
+
+ Returns:
+ ndarray: The brightened image.
+ """
+ degenerated = np.zeros_like(img)
+    # Note: manually convert the dtype to np.float32 to match the
+    # result of PIL.ImageEnhance.Brightness as closely as possible.
+ # Set beta=1-factor, and gamma=0
+ brightened_img = cv2.addWeighted(
+ img.astype(np.float32), factor, degenerated.astype(np.float32),
+ 1 - factor, 0)
+ brightened_img = np.clip(brightened_img, 0, 255)
+ return brightened_img.astype(img.dtype)
+
+
+def adjust_contrast(img, factor=1.):
+ """Adjust image contrast.
+
+ This function controls the contrast of an image. An
+ enhancement factor of 0.0 gives a solid grey
+ image. A factor of 1.0 gives the original image. It
+ blends the source image and the degenerated mean image:
+
+ .. math::
+ output = img * factor + degenerated * (1 - factor)
+
+ Args:
+ img (ndarray): Image to be contrasted. BGR order.
+ factor (float): Same as :func:`mmcv.adjust_brightness`.
+
+ Returns:
+ ndarray: The contrasted image.
+ """
+ gray_img = bgr2gray(img)
+ hist = np.histogram(gray_img, 256, (0, 255))[0]
+ mean = round(np.sum(gray_img) / np.sum(hist))
+ degenerated = (np.ones_like(img[..., 0]) * mean).astype(img.dtype)
+ degenerated = gray2bgr(degenerated)
+ contrasted_img = cv2.addWeighted(
+ img.astype(np.float32), factor, degenerated.astype(np.float32),
+ 1 - factor, 0)
+ contrasted_img = np.clip(contrasted_img, 0, 255)
+ return contrasted_img.astype(img.dtype)
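+
+# Usage sketch (editor's illustration, not part of upstream mmcv; `img` is
+# assumed to be a uint8 BGR image array):
+#
+#   >>> darker = adjust_brightness(img, factor=0.5)
+#   >>> punchier = adjust_contrast(img, factor=1.5)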
+
+
+def auto_contrast(img, cutoff=0):
+ """Auto adjust image contrast.
+
+    This function maximizes (normalizes) image contrast by first removing the
+    cutoff percent of the lightest and darkest pixels from the histogram and
+    then remapping the image so that the darkest pixel becomes black (0) and
+    the lightest becomes white (255).
+
+ Args:
+ img (ndarray): Image to be contrasted. BGR order.
+ cutoff (int | float | tuple): The cutoff percent of the lightest and
+ darkest pixels to be removed. If given as tuple, it shall be
+ (low, high). Otherwise, the single value will be used for both.
+ Defaults to 0.
+
+ Returns:
+ ndarray: The contrasted image.
+ """
+
+ def _auto_contrast_channel(im, c, cutoff):
+ im = im[:, :, c]
+ # Compute the histogram of the image channel.
+ histo = np.histogram(im, 256, (0, 255))[0]
+ # Remove cut-off percent pixels from histo
+ histo_sum = np.cumsum(histo)
+ cut_low = histo_sum[-1] * cutoff[0] // 100
+ cut_high = histo_sum[-1] - histo_sum[-1] * cutoff[1] // 100
+ histo_sum = np.clip(histo_sum, cut_low, cut_high) - cut_low
+ histo = np.concatenate([[histo_sum[0]], np.diff(histo_sum)], 0)
+
+ # Compute mapping
+ low, high = np.nonzero(histo)[0][0], np.nonzero(histo)[0][-1]
+ # If all the values have been cut off, return the origin img
+ if low >= high:
+ return im
+ scale = 255.0 / (high - low)
+ offset = -low * scale
+ lut = np.array(range(256))
+ lut = lut * scale + offset
+ lut = np.clip(lut, 0, 255)
+ return lut[im]
+
+ if isinstance(cutoff, (int, float)):
+ cutoff = (cutoff, cutoff)
+ else:
+ assert isinstance(cutoff, tuple), 'cutoff must be of type int, ' \
+ f'float or tuple, but got {type(cutoff)} instead.'
+ # Auto adjusts contrast for each channel independently and then stacks
+ # the result.
+ s1 = _auto_contrast_channel(img, 0, cutoff)
+ s2 = _auto_contrast_channel(img, 1, cutoff)
+ s3 = _auto_contrast_channel(img, 2, cutoff)
+ contrasted_img = np.stack([s1, s2, s3], axis=-1)
+ return contrasted_img.astype(img.dtype)
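+
+# Usage sketch (editor's illustration, not part of upstream mmcv):
+#
+#   >>> stretched = auto_contrast(img, cutoff=1)       # clip 1% at both ends
+#   >>> stretched = auto_contrast(img, cutoff=(2, 5))  # asymmetric (low, high)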
+
+
+def adjust_sharpness(img, factor=1., kernel=None):
+ """Adjust image sharpness.
+
+ This function controls the sharpness of an image. An
+ enhancement factor of 0.0 gives a blurred image. A
+ factor of 1.0 gives the original image. And a factor
+ of 2.0 gives a sharpened image. It blends the source
+ image and the degenerated mean image:
+
+ .. math::
+ output = img * factor + degenerated * (1 - factor)
+
+ Args:
+ img (ndarray): Image to be sharpened. BGR order.
+ factor (float): Same as :func:`mmcv.adjust_brightness`.
+ kernel (np.ndarray, optional): Filter kernel to be applied on the img
+ to obtain the degenerated img. Defaults to None.
+
+ Note:
+        No sanity check is enforced on the user-provided kernel. With an
+        inappropriate kernel, ``adjust_sharpness`` may not sharpen at all
+        and will instead apply whatever transform the kernel defines.
+
+ Returns:
+ ndarray: The sharpened image.
+ """
+
+ if kernel is None:
+ # adopted from PIL.ImageFilter.SMOOTH
+ kernel = np.array([[1., 1., 1.], [1., 5., 1.], [1., 1., 1.]]) / 13
+ assert isinstance(kernel, np.ndarray), \
+ f'kernel must be of type np.ndarray, but got {type(kernel)} instead.'
+ assert kernel.ndim == 2, \
+ f'kernel must have a dimension of 2, but got {kernel.ndim} instead.'
+
+ degenerated = cv2.filter2D(img, -1, kernel)
+ sharpened_img = cv2.addWeighted(
+ img.astype(np.float32), factor, degenerated.astype(np.float32),
+ 1 - factor, 0)
+ sharpened_img = np.clip(sharpened_img, 0, 255)
+ return sharpened_img.astype(img.dtype)
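+
+# Usage sketch (editor's illustration, not part of upstream mmcv):
+#
+#   >>> blurred = adjust_sharpness(img, factor=0.)   # fully smoothed
+#   >>> sharper = adjust_sharpness(img, factor=2.)   # over-sharpened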
+
+
+def adjust_lighting(img, eigval, eigvec, alphastd=0.1, to_rgb=True):
+ """AlexNet-style PCA jitter.
+
+    This data augmentation is proposed in "ImageNet Classification with Deep
+    Convolutional Neural Networks" (the AlexNet paper).
+
+ Args:
+        img (ndarray): Image whose lighting is to be adjusted. BGR order.
+        eigval (ndarray): the eigenvalues of the covariance matrix of pixel
+            values.
+        eigvec (ndarray): the eigenvectors of the covariance matrix of pixel
+            values.
+        alphastd (float): The standard deviation of the distribution of
+            alpha. Defaults to 0.1.
+ to_rgb (bool): Whether to convert img to rgb.
+
+ Returns:
+ ndarray: The adjusted image.
+ """
+ assert isinstance(eigval, np.ndarray) and isinstance(eigvec, np.ndarray), \
+ f'eigval and eigvec should both be of type np.ndarray, got ' \
+ f'{type(eigval)} and {type(eigvec)} instead.'
+
+ assert eigval.ndim == 1 and eigvec.ndim == 2
+ assert eigvec.shape == (3, eigval.shape[0])
+ n_eigval = eigval.shape[0]
+ assert isinstance(alphastd, float), 'alphastd should be of type float, ' \
+ f'got {type(alphastd)} instead.'
+
+ img = img.copy().astype(np.float32)
+ if to_rgb:
+ cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) # inplace
+
+ alpha = np.random.normal(0, alphastd, n_eigval)
+ alter = eigvec \
+ * np.broadcast_to(alpha.reshape(1, n_eigval), (3, n_eigval)) \
+ * np.broadcast_to(eigval.reshape(1, n_eigval), (3, n_eigval))
+ alter = np.broadcast_to(alter.sum(axis=1).reshape(1, 1, 3), img.shape)
+ img_adjusted = img + alter
+ return img_adjusted
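+
+# Usage sketch (editor's illustration, not part of upstream mmcv; the values
+# below are the PCA lighting statistics commonly used for ImageNet-trained
+# models, scaled to the 0-255 range):
+#
+#   >>> eigval = np.array([55.4625, 4.7940, 1.1475])
+#   >>> eigvec = np.array([[-0.5675, 0.7192, 0.4009],
+#   ...                    [-0.5808, -0.0045, -0.8140],
+#   ...                    [-0.5836, -0.6948, 0.4203]])
+#   >>> jittered = adjust_lighting(img, eigval, eigvec, alphastd=0.1)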
+
+
+def lut_transform(img, lut_table):
+ """Transform array by look-up table.
+
+ The function lut_transform fills the output array with values from the
+ look-up table. Indices of the entries are taken from the input array.
+
+ Args:
+ img (ndarray): Image to be transformed.
+ lut_table (ndarray): look-up table of 256 elements; in case of
+ multi-channel input array, the table should either have a single
+ channel (in this case the same table is used for all channels) or
+ the same number of channels as in the input array.
+
+ Returns:
+ ndarray: The transformed image.
+ """
+ assert isinstance(img, np.ndarray)
+ assert 0 <= np.min(img) and np.max(img) <= 255
+ assert isinstance(lut_table, np.ndarray)
+ assert lut_table.shape == (256, )
+
+ return cv2.LUT(np.array(img, dtype=np.uint8), lut_table)
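+
+# Usage sketch (editor's illustration, not part of upstream mmcv): a simple
+# gamma-correction table applied through the LUT.
+#
+#   >>> gamma = 2.2
+#   >>> lut = (np.arange(256) / 255.) ** (1. / gamma) * 255
+#   >>> corrected = lut_transform(img, lut.astype(np.uint8))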
+
+
+def clahe(img, clip_limit=40.0, tile_grid_size=(8, 8)):
+ """Use CLAHE method to process the image.
+
+ See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J].
+ Graphics Gems, 1994:474-485.` for more information.
+
+ Args:
+ img (ndarray): Image to be processed.
+ clip_limit (float): Threshold for contrast limiting. Default: 40.0.
+ tile_grid_size (tuple[int]): Size of grid for histogram equalization.
+ Input image will be divided into equally sized rectangular tiles.
+ It defines the number of tiles in row and column. Default: (8, 8).
+
+ Returns:
+ ndarray: The processed image.
+ """
+ assert isinstance(img, np.ndarray)
+ assert img.ndim == 2
+ assert isinstance(clip_limit, (float, int))
+ assert is_tuple_of(tile_grid_size, int)
+ assert len(tile_grid_size) == 2
+
+ clahe = cv2.createCLAHE(clip_limit, tile_grid_size)
+ return clahe.apply(np.array(img, dtype=np.uint8))
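+
+# Usage sketch (editor's illustration, not part of upstream mmcv; `clahe`
+# expects a single-channel (2-D) image):
+#
+#   >>> gray = bgr2gray(img)
+#   >>> enhanced = clahe(gray, clip_limit=40.0, tile_grid_size=(8, 8))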
+
+
+def adjust_hue(img: np.ndarray, hue_factor: float) -> np.ndarray:
+ """Adjust hue of an image.
+
+ The image hue is adjusted by converting the image to HSV and cyclically
+ shifting the intensities in the hue channel (H). The image is then
+    converted back to the original image mode.
+
+ `hue_factor` is the amount of shift in H channel and must be in the
+ interval `[-0.5, 0.5]`.
+
+ Modified from
+ https://github.com/pytorch/vision/blob/main/torchvision/
+ transforms/functional.py
+
+ Args:
+ img (ndarray): Image to be adjusted.
+ hue_factor (float): How much to shift the hue channel. Should be in
+ [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in
+ HSV space in positive and negative direction respectively.
+ 0 means no shift. Therefore, both -0.5 and 0.5 will give an image
+ with complementary colors while 0 gives the original image.
+
+ Returns:
+ ndarray: Hue adjusted image.
+ """
+
+ if not (-0.5 <= hue_factor <= 0.5):
+ raise ValueError(f'hue_factor:{hue_factor} is not in [-0.5, 0.5].')
+ if not (isinstance(img, np.ndarray) and (img.ndim in {2, 3})):
+ raise TypeError('img should be ndarray with dim=[2 or 3].')
+
+ dtype = img.dtype
+ img = img.astype(np.uint8)
+ hsv_img = cv2.cvtColor(img, cv2.COLOR_RGB2HSV_FULL)
+ h, s, v = cv2.split(hsv_img)
+ h = h.astype(np.uint8)
+    # uint8 addition takes care of wrapping around the hue boundary
+ with np.errstate(over='ignore'):
+ h += np.uint8(hue_factor * 255)
+ hsv_img = cv2.merge([h, s, v])
+ return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2RGB_FULL).astype(dtype)
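+
+# Usage sketch (editor's illustration, not part of upstream mmcv; note that
+# `adjust_hue` assumes an RGB image, unlike the BGR-oriented helpers above):
+#
+#   >>> rgb = img[..., ::-1]                       # BGR -> RGB
+#   >>> shifted = adjust_hue(rgb, hue_factor=0.1)
+#   >>> complementary = adjust_hue(rgb, hue_factor=-0.5)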
diff --git a/mmcv/mmcv/model_zoo/deprecated.json b/mmcv/mmcv/model_zoo/deprecated.json
new file mode 100644
index 0000000000000000000000000000000000000000..25cf6f28caecc22a77e3136fefa6b8dfc0e6cb5b
--- /dev/null
+++ b/mmcv/mmcv/model_zoo/deprecated.json
@@ -0,0 +1,6 @@
+{
+ "resnet50_caffe": "detectron/resnet50_caffe",
+ "resnet50_caffe_bgr": "detectron2/resnet50_caffe_bgr",
+ "resnet101_caffe": "detectron/resnet101_caffe",
+ "resnet101_caffe_bgr": "detectron2/resnet101_caffe_bgr"
+}
diff --git a/mmcv/mmcv/model_zoo/mmcls.json b/mmcv/mmcv/model_zoo/mmcls.json
new file mode 100644
index 0000000000000000000000000000000000000000..c073a41d0aeb44ee0243f97ecc3558de538f9300
--- /dev/null
+++ b/mmcv/mmcv/model_zoo/mmcls.json
@@ -0,0 +1,59 @@
+{
+ "vgg11": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.pth",
+ "vgg13": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.pth",
+ "vgg16": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.pth",
+ "vgg19": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.pth",
+ "vgg11_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.pth",
+ "vgg13_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.pth",
+ "vgg16_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.pth",
+ "vgg19_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.pth",
+ "resnet18": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_8xb32_in1k_20210831-fbbb1da6.pth",
+ "resnet34": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_8xb32_in1k_20210831-f257d4e6.pth",
+ "resnet50": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb32_in1k_20210831-ea4938fc.pth",
+ "resnet101": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_8xb32_in1k_20210831-539c63f8.pth",
+ "resnet152": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_8xb32_in1k_20210901-4d7582fa.pth",
+ "resnet50_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_b32x8_imagenet_20210531-db14775a.pth",
+ "resnet101_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_b32x8_imagenet_20210531-6e13bcd3.pth",
+ "resnet152_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_b32x8_imagenet_20210531-278cf22a.pth",
+ "resnext50_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.pth",
+ "resnext101_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.pth",
+ "resnext101_32x8d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.pth",
+ "resnext152_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.pth",
+ "se-resnet50": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200804-ae206104.pth",
+ "se-resnet101": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200804-ba5b51d4.pth",
+ "resnest50": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest50_imagenet_converted-1ebf0afe.pth",
+ "resnest101": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest101_imagenet_converted-032caa52.pth",
+ "resnest200": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest200_imagenet_converted-581a60f2.pth",
+ "resnest269": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest269_imagenet_converted-59930960.pth",
+ "shufflenet_v1": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth",
+ "shufflenet_v2": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth",
+ "mobilenet_v2": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth",
+ "mobilenet_v3_small": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth",
+ "mobilenet_v3_large": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_large-3ea3c186.pth",
+ "repvgg_A0": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A0_3rdparty_4xb64-coslr-120e_in1k_20210909-883ab98c.pth",
+ "repvgg_A1": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A1_3rdparty_4xb64-coslr-120e_in1k_20210909-24003a24.pth",
+ "repvgg_A2": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A2_3rdparty_4xb64-coslr-120e_in1k_20210909-97d7695a.pth",
+ "repvgg_B0": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B0_3rdparty_4xb64-coslr-120e_in1k_20210909-446375f4.pth",
+ "repvgg_B1": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1_3rdparty_4xb64-coslr-120e_in1k_20210909-750cdf67.pth",
+ "repvgg_B1g2": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g2_3rdparty_4xb64-coslr-120e_in1k_20210909-344f6422.pth",
+ "repvgg_B1g4": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g4_3rdparty_4xb64-coslr-120e_in1k_20210909-d4c1a642.pth",
+ "repvgg_B2": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2_3rdparty_4xb64-coslr-120e_in1k_20210909-bd6b937c.pth",
+ "repvgg_B2g4": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-7b7955f0.pth",
+ "repvgg_B3": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-dda968bf.pth",
+ "repvgg_B3g4": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-4e54846a.pth",
+ "repvgg_D2se": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-D2se_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-cf3139b7.pth",
+ "res2net101_w26": "https://download.openmmlab.com/mmclassification/v0/res2net/res2net101-w26-s4_3rdparty_8xb32_in1k_20210927-870b6c36.pth",
+ "res2net50_w14": "https://download.openmmlab.com/mmclassification/v0/res2net/res2net50-w14-s8_3rdparty_8xb32_in1k_20210927-bc967bf1.pth",
+ "res2net50_w26": "https://download.openmmlab.com/mmclassification/v0/res2net/res2net50-w26-s8_3rdparty_8xb32_in1k_20210927-f547a94b.pth",
+ "swin_tiny": "https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_tiny_224_b16x64_300e_imagenet_20210616_090925-66df6be6.pth",
+ "swin_small": "https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_small_224_b16x64_300e_imagenet_20210615_110219-7f9d988b.pth",
+ "swin_base": "https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_base_patch4_window7_224_22kto1k-f967f799.pth",
+ "swin_large": "https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_large_patch4_window7_224_22kto1k-5f0996db.pth",
+ "t2t_vit_t_14": "https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_3rdparty_8xb64_in1k_20210928-b7c09b62.pth",
+ "t2t_vit_t_19": "https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_3rdparty_8xb64_in1k_20210928-7f1478d5.pth",
+ "t2t_vit_t_24": "https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_3rdparty_8xb64_in1k_20210928-fe95a61b.pth",
+ "tnt_small": "https://download.openmmlab.com/mmclassification/v0/tnt/tnt-small-p16_3rdparty_in1k_20210903-c56ee7df.pth",
+ "vit_base_p16": "https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-98e8652b.pth",
+ "vit_base_p32": "https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p32_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-9cea8599.pth",
+ "vit_large_p16": "https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-large-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-b20ba619.pth"
+}
diff --git a/mmcv/mmcv/model_zoo/open_mmlab.json b/mmcv/mmcv/model_zoo/open_mmlab.json
new file mode 100644
index 0000000000000000000000000000000000000000..8311db4feef92faa0841c697d75efbee8430c3a0
--- /dev/null
+++ b/mmcv/mmcv/model_zoo/open_mmlab.json
@@ -0,0 +1,50 @@
+{
+ "vgg16_caffe": "https://download.openmmlab.com/pretrain/third_party/vgg16_caffe-292e1171.pth",
+ "detectron/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_caffe-788b5fa3.pth",
+ "detectron2/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth",
+ "detectron/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_caffe-3ad79236.pth",
+ "detectron2/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_msra-6cc46731.pth",
+ "detectron2/resnext101_32x8d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x8d-1516f1aa.pth",
+ "resnext50_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext50-32x4d-0ab1a123.pth",
+ "resnext101_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d-a5af3160.pth",
+ "resnext101_64x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_64x4d-ee2c6f71.pth",
+ "contrib/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_thangvubk-ad1730dd.pth",
+ "detectron/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn-9186a21c.pth",
+ "detectron/resnet101_gn": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn-cac0ab98.pth",
+ "jhu/resnet50_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_ws-15beedd8.pth",
+ "jhu/resnet101_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn_ws-3e3c308c.pth",
+ "jhu/resnext50_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn_ws-0d87ac85.pth",
+ "jhu/resnext101_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn_ws-34ac1a9e.pth",
+ "jhu/resnext50_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn-c7e8b754.pth",
+ "jhu/resnext101_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn-ac3bb84e.pth",
+ "msra/hrnetv2_w18_small": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18_small-b5a04e21.pth",
+ "msra/hrnetv2_w18": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18-00eb2006.pth",
+ "msra/hrnetv2_w32": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w32-dc9eeb4f.pth",
+ "msra/hrnetv2_w40": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w40-ed0b031c.pth",
+ "msra/hrnetv2_w48": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w48-d2186c55.pth",
+ "bninception_caffe": "https://download.openmmlab.com/pretrain/third_party/bn_inception_caffe-ed2e8665.pth",
+ "kin400/i3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/i3d_r50_f32s2_k400-2c57e077.pth",
+ "kin400/nl3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/nl3d_r50_f32s2_k400-fa7e7caa.pth",
+ "res2net101_v1d_26w_4s": "https://download.openmmlab.com/pretrain/third_party/res2net101_v1d_26w_4s_mmdetv2-f0a600f9.pth",
+ "regnetx_400mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_400mf-a5b10d96.pth",
+ "regnetx_800mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_800mf-1f4be4c7.pth",
+ "regnetx_1.6gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_1.6gf-5791c176.pth",
+ "regnetx_3.2gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_3.2gf-c2599b0f.pth",
+ "regnetx_4.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_4.0gf-a88f671e.pth",
+ "regnetx_6.4gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_6.4gf-006af45d.pth",
+ "regnetx_8.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_8.0gf-3c68abe7.pth",
+ "regnetx_12gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_12gf-4c2a3350.pth",
+ "resnet18_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet18_v1c-b5776b93.pth",
+ "resnet50_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet50_v1c-2cccc1ad.pth",
+ "resnet101_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet101_v1c-e67eebb6.pth",
+ "mmedit/vgg16": "https://download.openmmlab.com/mmediting/third_party/vgg_state_dict.pth",
+ "mmedit/res34_en_nomixup": "https://download.openmmlab.com/mmediting/third_party/model_best_resnet34_En_nomixup.pth",
+ "mmedit/mobilenet_v2": "https://download.openmmlab.com/mmediting/third_party/mobilenet_v2.pth",
+ "contrib/mobilenet_v3_large": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_large-bc2c3fd3.pth",
+ "contrib/mobilenet_v3_small": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_small-47085aa1.pth",
+ "resnest50": "https://download.openmmlab.com/pretrain/third_party/resnest50_d2-7497a55b.pth",
+ "resnest101": "https://download.openmmlab.com/pretrain/third_party/resnest101_d2-f3b931b2.pth",
+ "resnest200": "https://download.openmmlab.com/pretrain/third_party/resnest200_d2-ca88e41f.pth",
+ "darknet53": "https://download.openmmlab.com/pretrain/third_party/darknet53-a628ea1b.pth",
+ "mmdet/mobilenet_v2": "https://download.openmmlab.com/mmdetection/v2.0/third_party/mobilenet_v2_batch256_imagenet-ff34753d.pth"
+}
diff --git a/mmcv/mmcv/model_zoo/torchvision_0.12.json b/mmcv/mmcv/model_zoo/torchvision_0.12.json
new file mode 100644
index 0000000000000000000000000000000000000000..06defe67484dff91cf6f69109324cb1dd9d64bc3
--- /dev/null
+++ b/mmcv/mmcv/model_zoo/torchvision_0.12.json
@@ -0,0 +1,57 @@
+{
+ "alexnet": "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth",
+ "densenet121": "https://download.pytorch.org/models/densenet121-a639ec97.pth",
+ "densenet169": "https://download.pytorch.org/models/densenet169-b2777c0a.pth",
+ "densenet201": "https://download.pytorch.org/models/densenet201-c1103571.pth",
+ "densenet161": "https://download.pytorch.org/models/densenet161-8d451a50.pth",
+ "efficientnet_b0": "https://download.pytorch.org/models/efficientnet_b0_rwightman-3dd342df.pth",
+ "efficientnet_b1": "https://download.pytorch.org/models/efficientnet_b1_rwightman-533bc792.pth",
+ "efficientnet_b2": "https://download.pytorch.org/models/efficientnet_b2_rwightman-bcdf34b7.pth",
+ "efficientnet_b3": "https://download.pytorch.org/models/efficientnet_b3_rwightman-cf984f9c.pth",
+ "efficientnet_b4": "https://download.pytorch.org/models/efficientnet_b4_rwightman-7eb33cd5.pth",
+ "efficientnet_b5": "https://download.pytorch.org/models/efficientnet_b5_lukemelas-b6417697.pth",
+ "efficientnet_b6": "https://download.pytorch.org/models/efficientnet_b6_lukemelas-c76e70fd.pth",
+ "efficientnet_b7": "https://download.pytorch.org/models/efficientnet_b7_lukemelas-dcc49843.pth",
+ "googlenet": "https://download.pytorch.org/models/googlenet-1378be20.pth",
+ "inception_v3_google": "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth",
+ "mobilenet_v2": "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth",
+ "mobilenet_v3_large": "https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth",
+ "mobilenet_v3_small": "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth",
+ "regnet_y_400mf": "https://download.pytorch.org/models/regnet_y_400mf-c65dace8.pth",
+ "regnet_y_800mf": "https://download.pytorch.org/models/regnet_y_800mf-1b27b58c.pth",
+ "regnet_y_1_6gf": "https://download.pytorch.org/models/regnet_y_1_6gf-b11a554e.pth",
+ "regnet_y_3_2gf": "https://download.pytorch.org/models/regnet_y_3_2gf-b5a9779c.pth",
+ "regnet_y_8gf": "https://download.pytorch.org/models/regnet_y_8gf-d0d0e4a8.pth",
+ "regnet_y_16gf": "https://download.pytorch.org/models/regnet_y_16gf-9e6ed7dd.pth",
+ "regnet_y_32gf": "https://download.pytorch.org/models/regnet_y_32gf-4dee3f7a.pth",
+ "regnet_x_400mf": "https://download.pytorch.org/models/regnet_x_400mf-adf1edd5.pth",
+ "regnet_x_800mf": "https://download.pytorch.org/models/regnet_x_800mf-ad17e45c.pth",
+ "regnet_x_1_6gf": "https://download.pytorch.org/models/regnet_x_1_6gf-e3633e7f.pth",
+ "regnet_x_3_2gf": "https://download.pytorch.org/models/regnet_x_3_2gf-f342aeae.pth",
+ "regnet_x_8gf": "https://download.pytorch.org/models/regnet_x_8gf-03ceed89.pth",
+ "regnet_x_16gf": "https://download.pytorch.org/models/regnet_x_16gf-2007eb11.pth",
+ "regnet_x_32gf": "https://download.pytorch.org/models/regnet_x_32gf-9d47f8d0.pth",
+ "resnet18": "https://download.pytorch.org/models/resnet18-f37072fd.pth",
+ "resnet34": "https://download.pytorch.org/models/resnet34-b627a593.pth",
+ "resnet50": "https://download.pytorch.org/models/resnet50-0676ba61.pth",
+ "resnet101": "https://download.pytorch.org/models/resnet101-63fe2227.pth",
+ "resnet152": "https://download.pytorch.org/models/resnet152-394f9c45.pth",
+ "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth",
+ "resnext101_32x8d": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth",
+ "wide_resnet50_2": "https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth",
+ "wide_resnet101_2": "https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth",
+ "shufflenetv2_x0.5": "https://download.pytorch.org/models/shufflenetv2_x0.5-f707e7126e.pth",
+ "shufflenetv2_x1.0": "https://download.pytorch.org/models/shufflenetv2_x1-5666bf0f80.pth",
+ "shufflenetv2_x1.5": null,
+ "shufflenetv2_x2.0": null,
+ "squeezenet1_0": "https://download.pytorch.org/models/squeezenet1_0-b66bff10.pth",
+ "squeezenet1_1": "https://download.pytorch.org/models/squeezenet1_1-b8a52dc0.pth",
+ "vgg11": "https://download.pytorch.org/models/vgg11-8a719046.pth",
+ "vgg13": "https://download.pytorch.org/models/vgg13-19584684.pth",
+ "vgg16": "https://download.pytorch.org/models/vgg16-397923af.pth",
+ "vgg19": "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth",
+ "vgg11_bn": "https://download.pytorch.org/models/vgg11_bn-6002323d.pth",
+ "vgg13_bn": "https://download.pytorch.org/models/vgg13_bn-abd245e5.pth",
+ "vgg16_bn": "https://download.pytorch.org/models/vgg16_bn-6c64b313.pth",
+ "vgg19_bn": "https://download.pytorch.org/models/vgg19_bn-c79401a0.pth"
+}
diff --git a/mmcv/mmcv/onnx/__init__.py b/mmcv/mmcv/onnx/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d7eb5b0db770144ac6676bd1c7e80d7d2eb7e02
--- /dev/null
+++ b/mmcv/mmcv/onnx/__init__.py
@@ -0,0 +1,5 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .info import is_custom_op_loaded
+from .symbolic import register_extra_symbolics
+
+__all__ = ['register_extra_symbolics', 'is_custom_op_loaded']
diff --git a/mmcv/mmcv/onnx/info.py b/mmcv/mmcv/onnx/info.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8325a9c0d0dc3b48b77e9da307341059017ea28
--- /dev/null
+++ b/mmcv/mmcv/onnx/info.py
@@ -0,0 +1,35 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import os
+import warnings
+
+import torch
+
+
+def is_custom_op_loaded() -> bool:
+
+    # The following text-style strings are taken from the colorama package
+ bright_style, reset_style = '\x1b[1m', '\x1b[0m'
+ red_text, blue_text = '\x1b[31m', '\x1b[34m'
+ white_background = '\x1b[107m'
+
+ msg = white_background + bright_style + red_text
+    msg += 'DeprecationWarning: This function will be deprecated in the '
+    msg += 'future. ' + blue_text
+    msg += 'Please switch to the unified model deployment toolbox '
+    msg += 'MMDeploy: https://github.com/open-mmlab/mmdeploy'
+ msg += reset_style
+ warnings.warn(msg)
+
+ flag = False
+ try:
+ from ..tensorrt import is_tensorrt_plugin_loaded
+ flag = is_tensorrt_plugin_loaded()
+ except (ImportError, ModuleNotFoundError):
+ pass
+ if not flag:
+ try:
+ from ..ops import get_onnxruntime_op_path
+ ort_lib_path = get_onnxruntime_op_path()
+ flag = os.path.exists(ort_lib_path)
+ except (ImportError, ModuleNotFoundError):
+ pass
+ return flag or torch.__version__ == 'parrots'
diff --git a/mmcv/mmcv/onnx/onnx_utils/__init__.py b/mmcv/mmcv/onnx/onnx_utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ef101fec61e72abc0eb90266d453b5b22331378d
--- /dev/null
+++ b/mmcv/mmcv/onnx/onnx_utils/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/mmcv/mmcv/onnx/onnx_utils/symbolic_helper.py b/mmcv/mmcv/onnx/onnx_utils/symbolic_helper.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc9e96f8fbbb0cadec23411ddf93b31a90d049d0
--- /dev/null
+++ b/mmcv/mmcv/onnx/onnx_utils/symbolic_helper.py
@@ -0,0 +1,331 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Modified from https://github.com/pytorch/pytorch."""
+import warnings
+from functools import wraps
+from sys import maxsize
+
+import torch
+import torch.onnx
+# This import monkey-patches graph manipulation methods on Graph, used for the
+# ONNX symbolics
+import torch.onnx.utils
+from torch._C import ListType
+
+# ---------------------------------------------------------------------------------
+# Helper functions
+# ---------------------------------------------------------------------------------
+
+# Save some builtins as locals, because we'll shadow them below
+_sum = sum
+
+
+def _parse_arg(value, desc):
+ if desc == 'none':
+ return value
+ if desc == 'v' or not _is_value(value):
+ return value
+ if value.node().mustBeNone():
+ return None
+ if value.node().kind() == 'onnx::Constant':
+ tval = value.node()['value']
+ if desc == 'i':
+ return int(tval)
+ elif desc == 'f':
+ return float(tval)
+ elif desc == 'b':
+ return bool(tval)
+ elif desc == 's':
+ return str(tval)
+ elif desc == 't':
+ return tval
+ elif desc == 'is':
+ return [int(v) for v in tval]
+ elif desc == 'fs':
+ return [float(v) for v in tval]
+ else:
+ raise RuntimeError(
+ "ONNX symbolic doesn't know to interpret Constant node")
+ elif value.node().kind() == 'prim::ListConstruct':
+ if desc == 'is':
+ for v in value.node().inputs():
+ if v.node().kind() != 'onnx::Constant':
+ raise RuntimeError(
+ "Failed to export an ONNX attribute '" +
+ v.node().kind() +
+ "', since it's not constant, please try to make "
+ 'things (e.g., kernel size) static if possible')
+ return [int(v.node()['value']) for v in value.node().inputs()]
+ else:
+ raise RuntimeError(
+ "ONNX symbolic doesn't know to interpret ListConstruct node")
+
+ raise RuntimeError(f'Unexpected node type: {value.node().kind()}')
+
+
+def _maybe_get_const(value, desc):
+ if _is_value(value) and value.node().kind() == 'onnx::Constant':
+ return _parse_arg(value, desc)
+ return value
+
+
+def _maybe_get_scalar(value):
+ value_t = _maybe_get_const(value, 't')
+ if isinstance(value_t, torch.Tensor) and value_t.shape == ():
+ return value_t
+ return value
+
+
+def _get_const(value, desc, arg_name):
+ if _is_value(value) and value.node().kind() not in ('onnx::Constant',
+ 'prim::Constant'):
+ raise RuntimeError('ONNX symbolic expected a constant'
+ ' value of the {} argument, got `{}`'.format(
+ arg_name, value))
+ return _parse_arg(value, desc)
+
+
+def _unpack_list(list_value):
+ list_node = list_value.node()
+ assert list_node.kind() == 'prim::ListConstruct'
+ return list(list_node.inputs())
+
+
+# Check if list_value is output from prim::ListConstruct
+# This is usually called before _unpack_list to ensure the list can be
+# unpacked.
+def _is_packed_list(list_value):
+ return _is_value(
+ list_value) and list_value.node().kind() == 'prim::ListConstruct'
+
+
+def parse_args(*arg_descriptors):
+
+ def decorator(fn):
+ fn._arg_descriptors = arg_descriptors
+
+ def wrapper(g, *args):
+ # some args may be optional, so the length may be smaller
+ assert len(arg_descriptors) >= len(args)
+ args = [
+ _parse_arg(arg, arg_desc)
+ for arg, arg_desc in zip(args, arg_descriptors)
+ ]
+ return fn(g, *args)
+
+ # In Python 2 functools.wraps chokes on partially applied functions, so
+ # we need this as a workaround
+ try:
+ wrapper = wraps(fn)(wrapper)
+ except Exception:
+ pass
+ return wrapper
+
+ return decorator
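+
+# Usage sketch (editor's illustration, not part of upstream code; the op name
+# below is hypothetical): a symbolic function decorated with `parse_args`
+# receives its trailing arguments already parsed per the descriptors.
+#
+#   @parse_args('v', 'i', 'f')
+#   def my_symbolic(g, input, dim, eps):
+#       # `input` stays a graph Value ('v'), `dim` is parsed to a Python int
+#       # ('i') and `eps` to a float ('f') before this body runs.
+#       return g.op('SomeCustomOp', input, axis_i=dim, eps_f=eps)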
+
+
+def _scalar(x):
+ """Convert a scalar tensor into a Python value."""
+ assert x.numel() == 1
+ return x.item()
+
+
+def _if_scalar_type_as(g, self, tensor):
+ """Convert self into the same type of tensor, as necessary."""
+ if isinstance(self, torch._C.Value):
+ return self
+
+ scalar_type = tensor.type().scalarType()
+ if scalar_type:
+ ty = scalar_type.lower()
+ return getattr(self, ty)()
+
+ return self
+
+
+def _is_none(x):
+ return x.node().mustBeNone()
+
+
+def _is_value(x):
+ return isinstance(x, torch._C.Value)
+
+
+def _is_tensor_list(x):
+ return x.type().isSubtypeOf(ListType.ofTensors())
+
+
+def _unimplemented(op, msg):
+ warnings.warn('ONNX export failed on ' + op + ' because ' + msg +
+ ' not supported')
+
+
+def _try_get_scalar_type(*args):
+ for arg in args:
+ try:
+ return arg.type().scalarType()
+ except RuntimeError:
+ pass
+ return None
+
+
+def _topk_helper(g, input, k, dim, largest=True, sorted=False, out=None):
+ if out is not None:
+        _unimplemented('TopK', 'Out parameter')
+ if not _is_value(k):
+ k = g.op('Constant', value_t=torch.tensor([k], dtype=torch.int64))
+ else:
+ k = g.op('Reshape', k, g.op('Constant', value_t=torch.tensor([1])))
+ return g.op(
+ 'TopK',
+ input,
+ k,
+ axis_i=dim,
+ largest_i=largest,
+ sorted_i=sorted,
+ outputs=2)
+
+
+def _slice_helper(g,
+ input,
+ axes,
+ starts,
+ ends,
+ steps=None,
+ dynamic_slice=False):
+ # TODO(ruobing): add support for opset<10
+ from torch.onnx.symbolic_opset10 import _slice
+ return _slice(g, input, axes, starts, ends, steps, dynamic_slice)
+
+
+def _unsqueeze_helper(g, input, dim):
+ from torch.onnx.symbolic_opset9 import unsqueeze
+ return unsqueeze(g, input, dim)
+
+
+def _interpolate_size_to_scales(g, input, output_size, dim):
+ output_size = _maybe_get_const(output_size, 'is')
+ if _is_value(output_size):
+ offset = 2
+ offsets = g.op(
+ 'Constant', value_t=torch.ones(offset, dtype=torch.float32))
+ dividend = g.op(
+ 'Cast', output_size, to_i=cast_pytorch_to_onnx['Float'])
+ divisor = _slice_helper(
+ g, g.op('Shape', input), axes=[0], ends=[maxsize], starts=[offset])
+ divisor = g.op('Cast', divisor, to_i=cast_pytorch_to_onnx['Float'])
+ scale_dims = g.op('Div', dividend, divisor)
+ scales = g.op('Concat', offsets, scale_dims, axis_i=0)
+ else:
+ scales_constant = [
+ 1. if i < 2 else float(output_size[-(dim - i)]) /
+ float(input.type().sizes()[-(dim - i)]) for i in range(0, dim)
+ ]
+ scales = g.op(
+ 'Constant',
+ value_t=torch.tensor(scales_constant, dtype=torch.float32))
+ return scales
+
+
+def _interpolate_get_scales_if_available(g, scales):
+ if len(scales) == 0:
+ return None
+ # scales[0] is NoneType in Pytorch == 1.5.1
+ # scales[0] is TensorType with sizes = [] in Pytorch == 1.6.0
+ # scales[0] is ListType in Pytorch == 1.7.0
+ # scales[0] is TensorType with sizes = [2] in Pytorch == 1.8.0
+ scale_desc = 'fs' if scales[0].type().kind() == 'ListType' or (
+ scales[0].type().kind() == 'TensorType' and
+ (sum(scales[0].type().sizes()) > 1)) else 'f'
+ available_scales = _maybe_get_const(
+ scales[0], scale_desc) != -1 and not _is_none(scales[0])
+
+ if not available_scales:
+ return None
+
+ offsets = g.op('Constant', value_t=torch.ones(2, dtype=torch.float32))
+ if scale_desc == 'fs':
+ scales_list = g.op(
+ 'Constant',
+ value_t=torch.tensor(_maybe_get_const(scales[0], scale_desc)))
+ # modify to support PyTorch==1.7.0
+ # https://github.com/pytorch/pytorch/blob/75ee5756715e7161314ce037474843b68f69fc04/torch/onnx/symbolic_helper.py#L375 # noqa: E501
+ scales = g.op('Concat', offsets, scales_list, axis_i=0)
+ else:
+ # for PyTorch < 1.7.0
+ scales_list = []
+ for scale in scales:
+ unsqueezed_scale = _unsqueeze_helper(g, scale, 0)
+ # ONNX only supports float for the scales. double -> float.
+ unsqueezed_scale = g.op(
+ 'Cast', unsqueezed_scale, to_i=cast_pytorch_to_onnx['Float'])
+ scales_list.append(unsqueezed_scale)
+ scales = g.op('Concat', offsets, *scales_list, axis_i=0)
+ return scales
+
+
+def _get_interpolate_attributes(g, mode, args):
+ if mode == 'nearest':
+ align_corners = None
+ scales = args[0:]
+ else:
+ align_corners = args[0]
+ scales = args[1:]
+ scales = _interpolate_get_scales_if_available(g, scales)
+ return scales, align_corners
+
+
+def _interpolate_get_scales(g, scale_factor, dim):
+ offsets = g.op('Constant', value_t=torch.ones(2, dtype=torch.float32))
+ if isinstance(scale_factor.type(), torch._C.ListType):
+ return g.op('Concat', offsets, scale_factor, axis_i=0)
+ else:
+ scale_factor = _unsqueeze_helper(g, scale_factor, 0)
+ scale_factor = g.op(
+ 'Cast', scale_factor, to_i=cast_pytorch_to_onnx['Float'])
+ scales = [scale_factor for i in range(dim - 2)]
+ scale_factor = g.op('Concat', offsets, *scales, axis_i=0)
+ return scale_factor
+
+
+def _size_helper(g, self, dim):
+ full_shape = g.op('Shape', self)
+ from torch.onnx.symbolic_opset9 import select
+ return select(g, full_shape, g.op('Constant', value_t=torch.tensor([0])),
+ dim)
+
+
+def _avgpool_helper(tuple_fn, padding, kernel_size, stride, divisor_override,
+ name):
+ if divisor_override and divisor_override.node().kind() != 'prim::Constant':
+ return _unimplemented(name, 'divisor_override')
+ if not stride:
+ stride = kernel_size
+ padding = tuple(tuple_fn(padding))
+ return padding
+
+
+# Metaprogram symbolics for each ATen native specialized cast operator.
+# For e.g. we specify a function named `_cast_uint8_t` that instantiates an
+# ONNX cast node with `to` attribute 'UINT8'
+#
+# TODO: remove these once we support Type's in the JIT IR and we can once again
+# use the unified toType operator
+cast_pytorch_to_onnx = {
+ 'Byte': torch.onnx.TensorProtoDataType.UINT8,
+ 'Char': torch.onnx.TensorProtoDataType.INT8,
+ 'Double': torch.onnx.TensorProtoDataType.DOUBLE,
+ 'Float': torch.onnx.TensorProtoDataType.FLOAT,
+ 'Half': torch.onnx.TensorProtoDataType.FLOAT16,
+ 'Int': torch.onnx.TensorProtoDataType.INT32,
+ 'Long': torch.onnx.TensorProtoDataType.INT64,
+ 'Short': torch.onnx.TensorProtoDataType.INT16,
+ 'Bool': torch.onnx.TensorProtoDataType.BOOL,
+ 'ComplexFloat': torch.onnx.TensorProtoDataType.COMPLEX64,
+ 'ComplexDouble': torch.onnx.TensorProtoDataType.COMPLEX128,
+ 'Undefined': torch.onnx.TensorProtoDataType.UNDEFINED,
+}
+
+# Global set to store the list of quantized operators in the network.
+# This is currently only used in the conversion of quantized ops from PT
+# -> C2 via ONNX.
+_quantized_ops: set = set()
diff --git a/mmcv/mmcv/onnx/symbolic.py b/mmcv/mmcv/onnx/symbolic.py
new file mode 100644
index 0000000000000000000000000000000000000000..3599b3f26683ea2d1907aa5e839e02e474791370
--- /dev/null
+++ b/mmcv/mmcv/onnx/symbolic.py
@@ -0,0 +1,509 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Modified from https://github.com/pytorch/pytorch."""
+import os
+import warnings
+
+import numpy as np
+import torch
+from torch.nn.modules.utils import _pair, _single, _triple
+from torch.onnx.symbolic_helper import parse_args
+from torch.onnx.symbolic_registry import register_op
+
+from .onnx_utils import symbolic_helper as sym_help
+
+
+def _interpolate(name, dim, interpolate_mode):
+
+ def symbolic_fn(g, input, output_size, *args):
+ scales, align_corners = sym_help._get_interpolate_attributes(
+ g, interpolate_mode, args)
+ align_corners = sym_help._maybe_get_scalar(align_corners)
+ transformation_mode = 'asymmetric' \
+ if interpolate_mode == 'nearest' \
+ else 'align_corners' if align_corners else 'pytorch_half_pixel'
+ empty_tensor = g.op(
+ 'Constant', value_t=torch.tensor([], dtype=torch.float32))
+
+ if scales is None:
+ if 'ONNX_BACKEND' in os.environ and os.environ[
+ 'ONNX_BACKEND'] == 'TensorRT':
+ input_size = input.type().sizes()
+ # slice the first two dim
+ input_size = input_size[:2]
+ # convert output_size to int type
+ output_size = sym_help._maybe_get_const(output_size, 'is')
+ input_size.extend(output_size)
+ output_size = g.op(
+ 'Constant',
+ value_t=torch.tensor(input_size, dtype=torch.int64))
+ else:
+ input_size = g.op('Shape', input)
+ input_size_beg = sym_help._slice_helper(
+ g, input_size, axes=[0], ends=[2], starts=[0])
+ output_size = g.op(
+ 'Cast',
+ output_size,
+ to_i=sym_help.cast_pytorch_to_onnx['Long'])
+ output_size = g.op(
+ 'Concat', input_size_beg, output_size, axis_i=0)
+ scales = g.op(
+ 'Constant', value_t=torch.tensor([], dtype=torch.float32))
+ return g.op(
+ 'Resize',
+ input,
+ empty_tensor,
+ # roi only takes effect with
+ # coordinate_transformation_mode="tf_crop_and_resize"
+ scales, # scales is not needed since we are sending out_size
+ output_size,
+ coordinate_transformation_mode_s=transformation_mode,
+ cubic_coeff_a_f=-0.75, # only valid when mode="cubic"
+ mode_s=interpolate_mode, # nearest, linear, or cubic
+ nearest_mode_s='floor') # only valid when mode="nearest"
+ else:
+ return g.op(
+ 'Resize',
+ input,
+ empty_tensor,
+ # roi only takes effect with
+ # coordinate_transformation_mode="tf_crop_and_resize"
+ scales, # resize according to the provided scales
+ coordinate_transformation_mode_s=transformation_mode,
+ cubic_coeff_a_f=-0.75, # only valid when mode="cubic"
+ mode_s=interpolate_mode, # nearest, linear, or cubic
+ nearest_mode_s='floor') # only valid when mode="nearest"
+
+ return symbolic_fn
+
+
+upsample_nearest1d = _interpolate('upsample_nearest1d', 3, 'nearest')
+upsample_nearest2d = _interpolate('upsample_nearest2d', 4, 'nearest')
+upsample_nearest3d = _interpolate('upsample_nearest3d', 5, 'nearest')
+upsample_linear1d = _interpolate('upsample_linear1d', 3, 'linear')
+upsample_bilinear2d = _interpolate('upsample_bilinear2d', 4, 'linear')
+upsample_trilinear3d = _interpolate('upsample_trilinear3d', 5, 'linear')
+upsample_bicubic2d = _interpolate('upsample_bicubic2d', 4, 'cubic')
+
+
+@parse_args('v', 'v', 'i', 'i', 'i', 'none')
+def topk(g, self, k, dim, largest, sorted, out=None):
+ return sym_help._topk_helper(
+ g, self, k, dim, largest=largest, sorted=sorted, out=out)
+
+
+def masked_select(g, self, mask):
+ from torch.onnx.symbolic_opset9 import expand_as, nonzero
+ index = nonzero(g, expand_as(g, mask, self))
+ return g.op('GatherND', self, index)
+
+
+def _prepare_onnx_paddings(g, dim, pad):
+ pad_len = torch.onnx.symbolic_opset9.size(
+ g, pad, g.op('Constant', value_t=torch.tensor([0])))
+ # Set extension = [0] * (dim * 2 - len(pad))
+ extension = g.op(
+ 'Sub',
+ g.op('Mul',
+ g.op('Constant', value_t=torch.tensor(dim, dtype=torch.int64)),
+ g.op('Constant', value_t=torch.tensor(2, dtype=torch.int64))),
+ pad_len)
+ pad = g.op('Cast', pad, to_i=sym_help.cast_pytorch_to_onnx['Long'])
+ paddings = g.op(
+ 'Concat',
+ pad,
+ g.op(
+ 'ConstantOfShape',
+ extension,
+ value_t=torch.tensor([0], dtype=torch.int64)),
+ axis_i=0)
+ paddings = g.op('Reshape', paddings,
+ g.op('Constant', value_t=torch.tensor([-1, 2])))
+ paddings = g.op(
+ 'Transpose',
+ torch.onnx.symbolic_opset10.flip(g, paddings, [0]),
+ perm_i=[1, 0])
+ paddings = g.op('Reshape', paddings,
+ g.op('Constant', value_t=torch.tensor([-1])))
+ padding_c = g.op(
+ 'Cast', paddings, to_i=sym_help.cast_pytorch_to_onnx['Long'])
+ return padding_c
+
+
+def constant_pad_nd(g, input, padding, value=None):
+ mode = 'constant'
+ value = sym_help._maybe_get_scalar(value)
+ value = sym_help._if_scalar_type_as(g, value, input)
+ pad = _prepare_onnx_paddings(g, input.type().dim(), padding)
+ return g.op('Pad', input, pad, value, mode_s=mode)
+
+
+def reflection_pad(g, input, padding):
+ mode = 'reflect'
+ paddings = _prepare_onnx_paddings(g, input.type().dim(), padding)
+ return g.op('Pad', input, paddings, mode_s=mode)
+
+
+reflection_pad1d = reflection_pad
+reflection_pad2d = reflection_pad
+reflection_pad3d = reflection_pad
+
+
+def _avg_pool(name, tuple_fn):
+
+ @parse_args('v', 'is', 'is', 'is', 'i', 'i', 'none')
+ def symbolic_fn(g,
+ input,
+ kernel_size,
+ stride,
+ padding,
+ ceil_mode,
+ count_include_pad,
+ divisor_override=None):
+ padding = sym_help._avgpool_helper(tuple_fn, padding, kernel_size,
+ stride, divisor_override, name)
+ if not stride:
+ stride = kernel_size
+ if count_include_pad:
+ input = g.op(
+ 'Pad',
+ input,
+ g.op(
+ 'Constant',
+ value_t=torch.tensor(((0, ) * 2 + padding) * 2)),
+ mode_s='constant')
+ padding = (0, ) * len(padding)
+ output = g.op(
+ 'AveragePool',
+ input,
+ kernel_shape_i=tuple_fn(kernel_size),
+ strides_i=tuple_fn(stride),
+ pads_i=padding * 2,
+ ceil_mode_i=ceil_mode)
+ return output
+
+ return symbolic_fn
+
+
+avg_pool1d = _avg_pool('avg_pool1d', _single)
+avg_pool2d = _avg_pool('avg_pool2d', _pair)
+avg_pool3d = _avg_pool('avg_pool3d', _triple)
+
+
+def _get_im2col_indices_along_dim(g, input_d, kernel_size_d, dilation_d,
+ padding_d, stride_d):
+ # Input is always 4-D (N, C, H, W)
+ # Calculate indices of sliding blocks along spatial dimension
+ # Slide the kernel over the input along each spatial dim d:
+ # each dimension d ranges from 0 to
+ # input[d] + 2 * padding[d] - dilation[d] * (kernel_size[d] - 1)
+ # with step size = stride
+
+ blocks_d = g.op('Add', input_d,
+ g.op('Constant', value_t=torch.tensor(padding_d * 2)))
+ blocks_d = g.op(
+ 'Sub', blocks_d,
+ g.op(
+ 'Constant',
+ value_t=torch.tensor(dilation_d * (kernel_size_d - 1))))
+
+ # Stride kernel over input and find starting indices along dim d
+ blocks_d_indices = g.op('Range', g.op('Constant', value_t=torch.tensor(0)),
+ blocks_d,
+ g.op('Constant', value_t=torch.tensor(stride_d)))
+
+ # Apply dilation on kernel and find its indices along dim d
+ kernel_grid = np.arange(0, kernel_size_d * dilation_d, dilation_d)
+ kernel_grid = g.op('Constant', value_t=torch.tensor([kernel_grid]))
+
+ # Broadcast and add kernel starting positions (indices) with
+ # kernel_grid along dim d, to get block indices along dim d
+ blocks_d_indices = g.op(
+ 'Unsqueeze', blocks_d_indices, axes_i=[0]) # Reshape to [1, -1]
+ kernel_mask = g.op('Reshape', kernel_grid,
+ g.op('Constant', value_t=torch.tensor([-1, 1])))
+ block_mask = g.op('Add', blocks_d_indices, kernel_mask)
+
+ return block_mask
+
+
+def _get_im2col_padded_input(g, input, padding_h, padding_w):
+ # Input is always 4-D tensor (N, C, H, W)
+ # Padding tensor has the following format: (padding_h, padding_w)
+ # Reshape the padding to follow ONNX format:
+ # (dim1_begin, dim2_begin,...,dim1_end, dim2_end,...)
+ pad = g.op(
+ 'Constant', value_t=torch.LongTensor([0, 0, padding_h, padding_w] * 2))
+ return g.op('Pad', input, pad)
+
+
+def _get_im2col_output_shape(g, input, kernel_h, kernel_w):
+ batch_dim = size(g, input, g.op('Constant', value_t=torch.tensor(0)))
+ channel_dim = size(g, input, g.op('Constant', value_t=torch.tensor(1)))
+ channel_unfolded = g.op(
+ 'Mul', channel_dim,
+ g.op('Constant', value_t=torch.tensor(kernel_h * kernel_w)))
+
+ return g.op(
+ 'Concat',
+ g.op('Unsqueeze', batch_dim, axes_i=[0]),
+ g.op('Unsqueeze', channel_unfolded, axes_i=[0]),
+ g.op('Constant', value_t=torch.tensor([-1])),
+ axis_i=0)
+
+
+def size(g, self, dim=None):
+ if dim is None:
+ return g.op('Shape', self)
+ return sym_help._size_helper(g, self, dim)
+
+
+@parse_args('v', 'is', 'is', 'is', 'is')
+def im2col(g, input, kernel_size, dilation, padding, stride):
+ # Input is always 4-D tensor (N, C, H, W)
+ # All other args are int[2]
+
+ input_h = size(g, input, g.op('Constant', value_t=torch.tensor(2)))
+ input_w = size(g, input, g.op('Constant', value_t=torch.tensor(3)))
+
+ stride_h, stride_w = stride[0], stride[1]
+ padding_h, padding_w = padding[0], padding[1]
+ dilation_h, dilation_w = dilation[0], dilation[1]
+ kernel_h, kernel_w = kernel_size[0], kernel_size[1]
+
+ blocks_row_indices = _get_im2col_indices_along_dim(g, input_h, kernel_h,
+ dilation_h, padding_h,
+ stride_h)
+ blocks_col_indices = _get_im2col_indices_along_dim(g, input_w, kernel_w,
+ dilation_w, padding_w,
+ stride_w)
+
+ output_shape = _get_im2col_output_shape(g, input, kernel_h, kernel_w)
+ padded_input = _get_im2col_padded_input(g, input, padding_h, padding_w)
+
+ output = g.op('Gather', padded_input, blocks_row_indices, axis_i=2)
+ output = g.op('Gather', output, blocks_col_indices, axis_i=4)
+ output = g.op('Transpose', output, perm_i=[0, 1, 2, 4, 3, 5])
+ return g.op('Reshape', output, output_shape)
+
+
+@parse_args('v', 'i')
+def one_hot(g, self, num_classes):
+ values = g.op('Constant', value_t=torch.LongTensor([0, 1]))
+ depth = g.op('Constant', value_t=torch.LongTensor([num_classes]))
+ return g.op('OneHot', self, depth, values, axis_i=-1)
+
+
+@parse_args('v', 'i', 'none')
+def softmax(g, input, dim, dtype=None):
+ input_dim = input.type().dim()
+ if input_dim:
+ # TODO: remove this as onnx opset 11 spec allows negative axes
+ if dim < 0:
+ dim = input_dim + dim
+ if input_dim == dim + 1:
+ softmax = g.op('Softmax', input, axis_i=dim)
+ if dtype and dtype.node().kind() != 'prim::Constant':
+ parsed_dtype = sym_help._get_const(dtype, 'i', 'dtype')
+ softmax = g.op(
+ 'Cast',
+ softmax,
+ to_i=sym_help.scalar_type_to_onnx[parsed_dtype])
+ return softmax
+
+ max_value = g.op('ReduceMax', input, axes_i=[dim], keepdims_i=1)
+ input = g.op('Sub', input, max_value)
+ exp = g.op('Exp', input)
+ sum = g.op('ReduceSum', exp, axes_i=[dim])
+ softmax = g.op('Div', exp, sum)
+ if dtype and dtype.node().kind() != 'prim::Constant':
+ parsed_dtype = sym_help._get_const(dtype, 'i', 'dtype')
+ softmax = g.op(
+ 'Cast', softmax, to_i=sym_help.scalar_type_to_onnx[parsed_dtype])
+ return softmax
+
+
+def _adaptive_pool(name, type, tuple_fn, fn=None):
+
+ @parse_args('v', 'is')
+ def symbolic_fn(g, input, output_size):
+ if output_size == [1] * len(output_size) and type == 'AveragePool':
+ return g.op('GlobalAveragePool', input)
+ if not input.isCompleteTensor():
+ if output_size == [1] * len(output_size):
+ return g.op('GlobalMaxPool', input), None
+ raise NotImplementedError(
+ '[Adaptive pool]:input size not accessible')
+ dim = input.type().sizes()[2:]
+ if output_size == [1] * len(output_size) and type == 'MaxPool':
+ return g.op('GlobalMaxPool', input), None
+
+ # compute stride = floor(input_size / output_size)
+ s = [int(dim[i] / output_size[i]) for i in range(0, len(dim))]
+
+ # compute kernel_size = input_size - (output_size - 1) * stride
+ k = [dim[i] - (output_size[i] - 1) * s[i] for i in range(0, len(dim))]
+
+ # call max_poolxd_with_indices to get indices in the output
+ if type == 'MaxPool':
+ return fn(g, input, k, k, (0, ) * len(dim), (1, ) * len(dim),
+ False)
+ output = g.op(
+ type,
+ input,
+ kernel_shape_i=tuple_fn(k),
+ strides_i=tuple_fn(s),
+ ceil_mode_i=False)
+ return output
+
+ return symbolic_fn
+
+
+adaptive_avg_pool1d = _adaptive_pool('adaptive_avg_pool1d', 'AveragePool',
+ _single)
+adaptive_avg_pool2d = _adaptive_pool('adaptive_avg_pool2d', 'AveragePool',
+ _pair)
+adaptive_avg_pool3d = _adaptive_pool('adaptive_avg_pool3d', 'AveragePool',
+ _triple)
+
+
+def new_full(g,
+ self,
+ size,
+ fill_value,
+ dtype,
+ layout,
+ device,
+ pin_memory=False):
+ from torch.onnx.symbolic_opset9 import full
+ if dtype is None and self.isCompleteTensor():
+ dtype = self.type().scalarType()
+ dtype = sym_help.scalar_type_to_onnx.index(
+ sym_help.cast_pytorch_to_onnx[dtype])
+ return full(g, size, fill_value, dtype, layout, device, pin_memory)
+
+
+@parse_args('v', 'v', 'i', 'i', 'i')
+def grid_sampler(g,
+ input,
+ grid,
+ interpolation_mode,
+ padding_mode,
+ align_corners=False):
+ return g.op(
+ 'mmcv::grid_sampler',
+ input,
+ grid,
+ interpolation_mode_i=interpolation_mode,
+ padding_mode_i=padding_mode,
+ align_corners_i=align_corners)
+
+
+@parse_args('v', 'i')
+def cummax(g, input, dim):
+ return g.op('mmcv::cummax', input, dim_i=dim, outputs=2)
+
+
+@parse_args('v', 'i')
+def cummin(g, input, dim):
+ return g.op('mmcv::cummin', input, dim_i=dim, outputs=2)
+
+
+@parse_args('v', 'v', 'is')
+def roll(g, input, shifts, dims):
+ from packaging import version
+ from torch.onnx.symbolic_opset9 import squeeze
+ input_shape = g.op('Shape', input)
+
+ need_flatten = len(dims) == 0
+ # If dims is not specified, the tensor will be flattened before
+ # rolling and then restored to the original shape.
+ if need_flatten:
+ resize_shape = input_shape
+ input = g.op('Reshape', input,
+ g.op('Constant', value_t=torch.LongTensor([1, -1])))
+ input_shape = g.op('Shape', input)
+ dims = [1]
+
+ for index, dim in enumerate(dims):
+ end_size = sym_help._slice_helper(
+ g, input_shape, axes=[0], ends=[dim + 1], starts=[dim])
+ shift_size = sym_help._slice_helper(
+ g, shifts, axes=[0], ends=[index + 1], starts=[index])
+ slice_size = g.op('Sub', end_size, shift_size)
+
+ # Cannot use Mod because TensorRT does not support it
+ div_size = g.op('Div', slice_size, end_size)
+ slice_size = g.op('Sub', slice_size, g.op('Mul', end_size, div_size))
+
+ if version.parse(torch.__version__) >= version.parse('1.7.0'):
+ # add dim=0 for pytorch 1.9.0
+ end_size = squeeze(g, end_size, 0)
+ slice_size = squeeze(g, slice_size, 0)
+ else:
+ end_size = g.op('Squeeze', end_size)
+ slice_size = g.op('Squeeze', slice_size)
+ dim = torch.LongTensor([dim])
+
+ input_slice0 = sym_help._slice_helper(
+ g,
+ input,
+ axes=dim,
+ starts=torch.LongTensor([0]),
+ ends=slice_size,
+ dynamic_slice=True)
+ input_slice1 = sym_help._slice_helper(
+ g,
+ input,
+ axes=dim,
+ ends=end_size,
+ starts=slice_size,
+ dynamic_slice=True)
+
+ input = g.op('Concat', input_slice1, input_slice0, axis_i=dim)
+
+ if need_flatten:
+ input = g.op('Reshape', input, resize_shape)
+
+ return input
+
+
+def register_extra_symbolics(opset=11):
+ # Following strings of text style are from colorama package
+ bright_style, reset_style = '\x1b[1m', '\x1b[0m'
+ red_text, blue_text = '\x1b[31m', '\x1b[34m'
+ white_background = '\x1b[107m'
+
+ msg = white_background + bright_style + red_text
+ msg += 'DeprecationWarning: This function will be deprecated in the future. '
+ msg += blue_text + 'Please use the unified model deployment toolbox '
+ msg += 'MMDeploy instead: https://github.com/open-mmlab/mmdeploy'
+ msg += reset_style
+ warnings.warn(msg)
+
+ register_op('one_hot', one_hot, '', opset)
+ register_op('im2col', im2col, '', opset)
+ register_op('topk', topk, '', opset)
+ register_op('softmax', softmax, '', opset)
+ register_op('constant_pad_nd', constant_pad_nd, '', opset)
+ register_op('reflection_pad1d', reflection_pad1d, '', opset)
+ register_op('reflection_pad2d', reflection_pad2d, '', opset)
+ register_op('reflection_pad3d', reflection_pad3d, '', opset)
+ register_op('avg_pool1d', avg_pool1d, '', opset)
+ register_op('avg_pool2d', avg_pool2d, '', opset)
+ register_op('avg_pool3d', avg_pool3d, '', opset)
+ register_op('adaptive_avg_pool1d', adaptive_avg_pool1d, '', opset)
+ register_op('adaptive_avg_pool2d', adaptive_avg_pool2d, '', opset)
+ register_op('adaptive_avg_pool3d', adaptive_avg_pool3d, '', opset)
+ register_op('masked_select', masked_select, '', opset)
+ register_op('upsample_nearest1d', upsample_nearest1d, '', opset)
+ register_op('upsample_nearest2d', upsample_nearest2d, '', opset)
+ register_op('upsample_nearest3d', upsample_nearest3d, '', opset)
+ register_op('upsample_linear1d', upsample_linear1d, '', opset)
+ register_op('upsample_bilinear2d', upsample_bilinear2d, '', opset)
+ register_op('upsample_trilinear3d', upsample_trilinear3d, '', opset)
+ register_op('upsample_bicubic2d', upsample_bicubic2d, '', opset)
+ register_op('new_full', new_full, '', opset)
+ register_op('grid_sampler', grid_sampler, '', opset)
+ register_op('cummax', cummax, '', opset)
+ register_op('cummin', cummin, '', opset)
+ register_op('roll', roll, '', opset)
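+
+
+# A minimal usage sketch (comment only, not executed). It illustrates the
+# typical flow of registering these extra symbolics before ONNX export;
+# `MyModel` is a placeholder and opset 11 matches the default above:
+#
+#   import torch
+#   from mmcv.onnx.symbolic import register_extra_symbolics
+#
+#   register_extra_symbolics(opset=11)
+#   model = MyModel().eval()
+#   dummy_input = torch.randn(1, 3, 224, 224)
+#   torch.onnx.export(model, dummy_input, 'model.onnx', opset_version=11)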
diff --git a/mmcv/mmcv/ops/__init__.py b/mmcv/mmcv/ops/__init__.py
new file mode 100755
index 0000000000000000000000000000000000000000..a65f14fff5f92039947d82a291fca09408f69f87
--- /dev/null
+++ b/mmcv/mmcv/ops/__init__.py
@@ -0,0 +1,106 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .active_rotated_filter import active_rotated_filter
+from .assign_score_withk import assign_score_withk
+from .ball_query import ball_query
+from .bbox import bbox_overlaps
+from .border_align import BorderAlign, border_align
+from .box_iou_rotated import box_iou_rotated
+from .carafe import CARAFE, CARAFENaive, CARAFEPack, carafe, carafe_naive
+from .cc_attention import CrissCrossAttention
+from .chamfer_distance import chamfer_distance
+from .contour_expand import contour_expand
+from .convex_iou import convex_giou, convex_iou
+from .corner_pool import CornerPool
+from .correlation import Correlation
+from .deform_conv import DeformConv2d, DeformConv2dPack, deform_conv2d
+from .deform_roi_pool import (DeformRoIPool, DeformRoIPoolPack,
+ ModulatedDeformRoIPoolPack, deform_roi_pool)
+from .deprecated_wrappers import Conv2d_deprecated as Conv2d
+from .deprecated_wrappers import ConvTranspose2d_deprecated as ConvTranspose2d
+from .deprecated_wrappers import Linear_deprecated as Linear
+from .deprecated_wrappers import MaxPool2d_deprecated as MaxPool2d
+from .diff_iou_rotated import diff_iou_rotated_2d, diff_iou_rotated_3d
+from .focal_loss import (SigmoidFocalLoss, SoftmaxFocalLoss,
+ sigmoid_focal_loss, softmax_focal_loss)
+from .furthest_point_sample import (furthest_point_sample,
+ furthest_point_sample_with_dist)
+from .fused_bias_leakyrelu import FusedBiasLeakyReLU, fused_bias_leakyrelu
+from .gather_points import gather_points
+from .group_points import GroupAll, QueryAndGroup, grouping_operation
+from .info import (get_compiler_version, get_compiling_cuda_version,
+ get_onnxruntime_op_path)
+from .iou3d import (boxes_iou3d, boxes_iou_bev, boxes_overlap_bev, nms3d,
+ nms3d_normal, nms_bev, nms_normal_bev)
+from .knn import knn
+from .masked_conv import MaskedConv2d, masked_conv2d
+from .min_area_polygons import min_area_polygons
+from .modulated_deform_conv import (ModulatedDeformConv2d,
+ ModulatedDeformConv2dPack,
+ modulated_deform_conv2d)
+from .multi_scale_deform_attn import MultiScaleDeformableAttention
+from .nms import batched_nms, nms, nms_match, nms_rotated, soft_nms
+from .pixel_group import pixel_group
+from .point_sample import (SimpleRoIAlign, point_sample,
+ rel_roi_point_to_rel_img_point)
+from .points_in_boxes import (points_in_boxes_all, points_in_boxes_cpu,
+ points_in_boxes_part)
+from .points_in_polygons import points_in_polygons
+from .points_sampler import PointsSampler
+from .prroi_pool import PrRoIPool, prroi_pool
+from .psa_mask import PSAMask
+from .riroi_align_rotated import RiRoIAlignRotated, riroi_align_rotated
+from .roi_align import RoIAlign, roi_align
+from .roi_align_rotated import RoIAlignRotated, roi_align_rotated
+from .roi_pool import RoIPool, roi_pool
+from .roiaware_pool3d import RoIAwarePool3d
+from .roipoint_pool3d import RoIPointPool3d
+from .rotated_feature_align import rotated_feature_align
+from .saconv import SAConv2d
+from .scatter_points import DynamicScatter, dynamic_scatter
+from .sparse_conv import (SparseConv2d, SparseConv3d, SparseConvTranspose2d,
+ SparseConvTranspose3d, SparseInverseConv2d,
+ SparseInverseConv3d, SubMConv2d, SubMConv3d)
+from .sparse_modules import SparseModule, SparseSequential
+from .sparse_pool import SparseMaxPool2d, SparseMaxPool3d
+from .sparse_structure import SparseConvTensor, scatter_nd
+from .sync_bn import SyncBatchNorm
+from .three_interpolate import three_interpolate
+from .three_nn import three_nn
+from .tin_shift import TINShift, tin_shift
+from .upfirdn2d import upfirdn2d
+from .voxelize import Voxelization, voxelization
+
+__all__ = [
+ 'bbox_overlaps', 'CARAFE', 'CARAFENaive', 'CARAFEPack', 'carafe',
+ 'carafe_naive', 'CornerPool', 'DeformConv2d', 'DeformConv2dPack',
+ 'deform_conv2d', 'DeformRoIPool', 'DeformRoIPoolPack',
+ 'ModulatedDeformRoIPoolPack', 'deform_roi_pool', 'SigmoidFocalLoss',
+ 'SoftmaxFocalLoss', 'sigmoid_focal_loss', 'softmax_focal_loss',
+ 'get_compiler_version', 'get_compiling_cuda_version',
+ 'get_onnxruntime_op_path', 'MaskedConv2d', 'masked_conv2d',
+ 'ModulatedDeformConv2d', 'ModulatedDeformConv2dPack',
+ 'modulated_deform_conv2d', 'batched_nms', 'nms', 'soft_nms', 'nms_match',
+ 'RoIAlign', 'roi_align', 'RoIPool', 'roi_pool', 'SyncBatchNorm', 'Conv2d',
+ 'ConvTranspose2d', 'Linear', 'MaxPool2d', 'CrissCrossAttention', 'PSAMask',
+ 'point_sample', 'rel_roi_point_to_rel_img_point', 'SimpleRoIAlign',
+ 'SAConv2d', 'TINShift', 'tin_shift', 'assign_score_withk',
+ 'box_iou_rotated', 'RoIPointPool3d', 'nms_rotated', 'knn', 'ball_query',
+ 'upfirdn2d', 'FusedBiasLeakyReLU', 'fused_bias_leakyrelu',
+ 'rotated_feature_align', 'RiRoIAlignRotated', 'riroi_align_rotated',
+ 'RoIAlignRotated', 'roi_align_rotated', 'pixel_group', 'QueryAndGroup',
+ 'GroupAll', 'grouping_operation', 'contour_expand', 'three_nn',
+ 'three_interpolate', 'MultiScaleDeformableAttention', 'BorderAlign',
+ 'border_align', 'gather_points', 'furthest_point_sample',
+ 'furthest_point_sample_with_dist', 'PointsSampler', 'Correlation',
+ 'boxes_iou3d', 'boxes_iou_bev', 'boxes_overlap_bev', 'nms_bev',
+ 'nms_normal_bev', 'nms3d', 'nms3d_normal', 'Voxelization', 'voxelization',
+ 'dynamic_scatter', 'DynamicScatter', 'RoIAwarePool3d', 'SparseConv2d',
+ 'SparseConv3d', 'SparseConvTranspose2d', 'SparseConvTranspose3d',
+ 'SparseInverseConv2d', 'SparseInverseConv3d', 'SubMConv2d', 'SubMConv3d',
+ 'SparseModule', 'SparseSequential', 'SparseMaxPool2d', 'SparseMaxPool3d',
+ 'SparseConvTensor', 'scatter_nd', 'points_in_boxes_part',
+ 'points_in_boxes_cpu', 'points_in_boxes_all', 'points_in_polygons',
+ 'min_area_polygons', 'active_rotated_filter', 'convex_iou', 'convex_giou',
+ 'diff_iou_rotated_2d', 'diff_iou_rotated_3d', 'chamfer_distance',
+ 'PrRoIPool', 'prroi_pool'
+]
diff --git a/mmcv/mmcv/ops/active_rotated_filter.py b/mmcv/mmcv/ops/active_rotated_filter.py
new file mode 100644
index 0000000000000000000000000000000000000000..46c2aa7806ab62a6d0544f6dc1fb609af3a8a483
--- /dev/null
+++ b/mmcv/mmcv/ops/active_rotated_filter.py
@@ -0,0 +1,64 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext',
+ ['active_rotated_filter_forward', 'active_rotated_filter_backward'])
+
+
+class ActiveRotatedFilterFunction(Function):
+ """Encoding the orientation information and generating orientation-
+ sensitive features.
+
+ The details are described in the paper `Align Deep Features for Oriented
+ Object Detection`_.
+ """
+
+ @staticmethod
+ def forward(ctx, input: torch.Tensor,
+ indices: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ input (torch.Tensor): Input features with shape
+ [num_output_planes, num_input_planes, num_orientations, H, W].
+ indices (torch.Tensor): Indices with shape
+ [num_orientations, H, W, num_rotations].
+
+ Returns:
+ torch.Tensor: Refined features with shape [num_output_planes *
+ num_rotations, num_input_planes * num_orientations, H, W].
+ """
+ ctx.save_for_backward(input, indices)
+ op, ip, o, h, w = input.size()
+ o, h, w, r = indices.size()
+ output = input.new_zeros((op * r, ip * o, h, w))
+ ext_module.active_rotated_filter_forward(input, indices, output)
+
+ return output
+
+ @staticmethod
+ @once_differentiable
+ def backward(ctx, grad_out: torch.Tensor) -> Tuple[torch.Tensor, None]:
+ """
+ Args:
+ grad_out (torch.Tensor): The gradient of the output features
+ with shape [num_output_planes * num_rotations,
+ num_input_planes * num_orientations, H, W].
+
+ Returns:
+ torch.Tensor: The gradient of the input features with shape
+ [num_output_planes, num_input_planes, num_orientations, H, W].
+ """
+ input, indices = ctx.saved_tensors
+ grad_in = torch.zeros_like(input)
+ ext_module.active_rotated_filter_backward(grad_out, indices, grad_in)
+ return grad_in, None
+
+
+active_rotated_filter = ActiveRotatedFilterFunction.apply
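+
+
+# A minimal usage sketch (comment only; the tensors below are placeholders).
+# Shapes follow the docstrings above and the op requires the compiled `_ext`
+# CUDA extension:
+#
+#   feats = torch.randn(32, 16, 8, 3, 3, device='cuda')                # [op, ip, o, H, W]
+#   indices = torch.zeros(8, 3, 3, 4, dtype=torch.int, device='cuda')  # [o, H, W, r]
+#   out = active_rotated_filter(feats, indices)                        # [op * r, ip * o, H, W]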
diff --git a/mmcv/mmcv/ops/assign_score_withk.py b/mmcv/mmcv/ops/assign_score_withk.py
new file mode 100644
index 0000000000000000000000000000000000000000..deca0892bddc52b51e9d2543a9e893f0bd67ebdb
--- /dev/null
+++ b/mmcv/mmcv/ops/assign_score_withk.py
@@ -0,0 +1,131 @@
+from typing import Tuple
+
+import torch
+from torch.autograd import Function
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext', ['assign_score_withk_forward', 'assign_score_withk_backward'])
+
+
+class AssignScoreWithK(Function):
+ r"""Perform weighted sum to generate output features according to scores.
+ Modified from `PAConv`_.
+
+ This is a memory-efficient CUDA implementation of the assign_scores
+ operation, which first transforms all point features with the weight bank,
+ then assembles neighbor features with ``knn_idx`` and performs a weighted
+ sum using ``scores``.
+
+ See appendix Sec. D of the paper for more detailed descriptions.
+
+ Note:
+ This implementation assumes using ``neighbor`` kernel input, which is
+ (point_features - center_features, point_features).
+ See https://github.com/CVMI-Lab/PAConv/blob/main/scene_seg/model/
+ pointnet2/paconv.py#L128 for more details.
+ """
+
+ @staticmethod
+ def forward(ctx,
+ scores: torch.Tensor,
+ point_features: torch.Tensor,
+ center_features: torch.Tensor,
+ knn_idx: torch.Tensor,
+ aggregate: str = 'sum') -> torch.Tensor:
+ """
+ Args:
+ scores (torch.Tensor): (B, npoint, K, M), predicted scores to
+ aggregate weight matrices in the weight bank.
+ ``npoint`` is the number of sampled centers.
+ ``K`` is the number of queried neighbors.
+ ``M`` is the number of weight matrices in the weight bank.
+ point_features (torch.Tensor): (B, N, M, out_dim)
+ Pre-computed point features to be aggregated.
+ center_features (torch.Tensor): (B, N, M, out_dim)
+ Pre-computed center features to be aggregated.
+ knn_idx (torch.Tensor): (B, npoint, K), index of sampled kNN.
+ We assume the first idx in each row is the idx of the center.
+ aggregate (str, optional): Aggregation method.
+ Can be 'sum', 'avg' or 'max'. Defaults to 'sum'.
+
+ Returns:
+ torch.Tensor: (B, out_dim, npoint, K), the aggregated features.
+ """
+ agg = {'sum': 0, 'avg': 1, 'max': 2}
+
+ B, N, M, out_dim = point_features.size()
+ _, npoint, K, _ = scores.size()
+
+ output = point_features.new_zeros((B, out_dim, npoint, K))
+ ext_module.assign_score_withk_forward(
+ point_features.contiguous(),
+ center_features.contiguous(),
+ scores.contiguous(),
+ knn_idx.contiguous(),
+ output,
+ B=B,
+ N0=N,
+ N1=npoint,
+ M=M,
+ K=K,
+ O=out_dim,
+ aggregate=agg[aggregate])
+
+ ctx.save_for_backward(output, point_features, center_features, scores,
+ knn_idx)
+ ctx.agg = agg[aggregate]
+
+ return output
+
+ @staticmethod
+ def backward(
+ ctx, grad_out: torch.Tensor
+ ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, None, None]:
+ """
+ Args:
+ grad_out (torch.Tensor): (B, out_dim, npoint, K)
+
+ Returns:
+ tuple[torch.Tensor]: A tuple containing five elements. The first
+ is the gradient of ``scores`` with shape (B, npoint, K, M). The
+ second is the gradient of ``point_features`` with shape
+ (B, N, M, out_dim). The third is the gradient of
+ ``center_features`` with shape (B, N, M, out_dim). The last
+ two are ``None``.
+ """
+ _, point_features, center_features, scores, knn_idx = ctx.saved_tensors
+
+ agg = ctx.agg
+
+ B, N, M, out_dim = point_features.size()
+ _, npoint, K, _ = scores.size()
+
+ grad_point_features = point_features.new_zeros(point_features.shape)
+ grad_center_features = center_features.new_zeros(center_features.shape)
+ grad_scores = scores.new_zeros(scores.shape)
+
+ ext_module.assign_score_withk_backward(
+ grad_out.contiguous(),
+ point_features.contiguous(),
+ center_features.contiguous(),
+ scores.contiguous(),
+ knn_idx.contiguous(),
+ grad_point_features,
+ grad_center_features,
+ grad_scores,
+ B=B,
+ N0=N,
+ N1=npoint,
+ M=M,
+ K=K,
+ O=out_dim,
+ aggregate=agg)
+
+ return grad_scores, grad_point_features, \
+ grad_center_features, None, None
+
+
+assign_score_withk = AssignScoreWithK.apply
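+
+
+# A minimal usage sketch (comment only; shapes follow the forward() docstring
+# and the op requires the compiled `_ext` CUDA extension):
+#
+#   B, N, npoint, K, M, out_dim = 2, 64, 16, 8, 4, 32
+#   scores = torch.rand(B, npoint, K, M, device='cuda')
+#   point_feats = torch.rand(B, N, M, out_dim, device='cuda')
+#   center_feats = torch.rand(B, N, M, out_dim, device='cuda')
+#   knn_idx = torch.randint(0, N, (B, npoint, K), device='cuda')
+#   out = assign_score_withk(scores, point_feats, center_feats, knn_idx)
+#   # out has shape (B, out_dim, npoint, K)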
diff --git a/mmcv/mmcv/ops/ball_query.py b/mmcv/mmcv/ops/ball_query.py
new file mode 100644
index 0000000000000000000000000000000000000000..d24e0446ca81a19a9e2d4b822cb32533f941d78f
--- /dev/null
+++ b/mmcv/mmcv/ops/ball_query.py
@@ -0,0 +1,58 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch
+from torch.autograd import Function
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['ball_query_forward'])
+
+
+class BallQuery(Function):
+ """Find nearby points in spherical space."""
+
+ @staticmethod
+ def forward(ctx, min_radius: float, max_radius: float, sample_num: int,
+ xyz: torch.Tensor, center_xyz: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ min_radius (float): minimum radius of the balls.
+ max_radius (float): maximum radius of the balls.
+ sample_num (int): maximum number of features in the balls.
+ xyz (torch.Tensor): (B, N, 3) xyz coordinates of the features.
+ center_xyz (torch.Tensor): (B, npoint, 3) centers of the ball
+ query.
+
+ Returns:
+ torch.Tensor: (B, npoint, nsample) tensor with the indices of the
+ features that form the query balls.
+ """
+ assert center_xyz.is_contiguous()
+ assert xyz.is_contiguous()
+ assert min_radius < max_radius
+
+ B, N, _ = xyz.size()
+ npoint = center_xyz.size(1)
+ idx = xyz.new_zeros(B, npoint, sample_num, dtype=torch.int)
+
+ ext_module.ball_query_forward(
+ center_xyz,
+ xyz,
+ idx,
+ b=B,
+ n=N,
+ m=npoint,
+ min_radius=min_radius,
+ max_radius=max_radius,
+ nsample=sample_num)
+ if torch.__version__ != 'parrots':
+ ctx.mark_non_differentiable(idx)
+ return idx
+
+ @staticmethod
+ def backward(ctx, a=None) -> Tuple[None, None, None, None]:
+ return None, None, None, None
+
+
+ball_query = BallQuery.apply
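+
+
+# A minimal usage sketch (comment only; shapes follow the docstring above and
+# the op requires the compiled `_ext` CUDA extension):
+#
+#   xyz = torch.rand(2, 1024, 3, device='cuda')          # (B, N, 3)
+#   centers = xyz[:, :128, :].contiguous()               # (B, npoint, 3)
+#   idx = ball_query(0.0, 0.2, 16, xyz, centers)         # (B, npoint, nsample), int indices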
diff --git a/mmcv/mmcv/ops/bbox.py b/mmcv/mmcv/ops/bbox.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf6bd43bbb0adcb4b6d104a815f73ed2e5912069
--- /dev/null
+++ b/mmcv/mmcv/ops/bbox.py
@@ -0,0 +1,130 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['bbox_overlaps'])
+
+
+def _bbox_overlaps_cpu(bboxes1: torch.Tensor,
+ bboxes2: torch.Tensor,
+ mode: str = 'iou',
+ aligned: bool = False,
+ offset: int = 0) -> torch.Tensor:
+ assert mode in ['iou', 'iof']
+
+ if aligned:
+ lt = torch.max(bboxes1[:, :2], bboxes2[:, :2]) # [rows, 2]
+ rb = torch.min(bboxes1[:, 2:], bboxes2[:, 2:]) # [rows, 2]
+
+ wh = (rb - lt + offset).clamp(min=0) # [rows, 2]
+ overlap = wh[:, 0] * wh[:, 1]
+ area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (
+ bboxes1[:, 3] - bboxes1[:, 1] + offset)
+
+ if mode == 'iou':
+ area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (
+ bboxes2[:, 3] - bboxes2[:, 1] + offset)
+ ious = overlap / (area1 + area2 - overlap)
+ else:
+ ious = overlap / area1
+ else:
+ lt = torch.max(bboxes1[:, None, :2], bboxes2[:, :2]) # [rows, cols, 2]
+ rb = torch.min(bboxes1[:, None, 2:], bboxes2[:, 2:]) # [rows, cols, 2]
+
+ wh = (rb - lt + offset).clamp(min=0) # [rows, cols, 2]
+ overlap = wh[:, :, 0] * wh[:, :, 1]
+ area1 = (bboxes1[:, 2] - bboxes1[:, 0] + offset) * (
+ bboxes1[:, 3] - bboxes1[:, 1] + offset)
+
+ if mode == 'iou':
+ area2 = (bboxes2[:, 2] - bboxes2[:, 0] + offset) * (
+ bboxes2[:, 3] - bboxes2[:, 1] + offset)
+ ious = overlap / (area1[:, None] + area2 - overlap)
+ else:
+ ious = overlap / (area1[:, None])
+
+ return ious
+
+
+def bbox_overlaps(bboxes1: torch.Tensor,
+ bboxes2: torch.Tensor,
+ mode: str = 'iou',
+ aligned: bool = False,
+ offset: int = 0) -> torch.Tensor:
+ """Calculate overlap between two set of bboxes.
+
+ If ``aligned`` is ``False``, then calculate the ious between each bbox
+ of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
+ bboxes1 and bboxes2.
+
+ Args:
+ bboxes1 (torch.Tensor): shape (m, 4) in <x1, y1, x2, y2> format or
+ empty.
+ bboxes2 (torch.Tensor): shape (n, 4) in <x1, y1, x2, y2> format or
+ empty. If aligned is ``True``, then m and n must be equal.
+ mode (str): "iou" (intersection over union) or "iof" (intersection
+ over foreground).
+
+ Returns:
+ torch.Tensor: The IoUs between the boxes. If ``aligned`` is
+ ``False``, the shape of ious is (m, n), else (m, 1).
+
+ Example:
+ >>> bboxes1 = torch.FloatTensor([
+ >>> [0, 0, 10, 10],
+ >>> [10, 10, 20, 20],
+ >>> [32, 32, 38, 42],
+ >>> ])
+ >>> bboxes2 = torch.FloatTensor([
+ >>> [0, 0, 10, 20],
+ >>> [0, 10, 10, 19],
+ >>> [10, 10, 20, 20],
+ >>> ])
+ >>> bbox_overlaps(bboxes1, bboxes2)
+ tensor([[0.5000, 0.0000, 0.0000],
+ [0.0000, 0.0000, 1.0000],
+ [0.0000, 0.0000, 0.0000]])
+
+ Example:
+ >>> empty = torch.FloatTensor([])
+ >>> nonempty = torch.FloatTensor([
+ >>> [0, 0, 10, 9],
+ >>> ])
+ >>> assert tuple(bbox_overlaps(empty, nonempty).shape) == (0, 1)
+ >>> assert tuple(bbox_overlaps(nonempty, empty).shape) == (1, 0)
+ >>> assert tuple(bbox_overlaps(empty, empty).shape) == (0, 0)
+ """
+
+ mode_dict = {'iou': 0, 'iof': 1}
+ assert mode in mode_dict.keys()
+ mode_flag = mode_dict[mode]
+ # Either the boxes are empty or the length of boxes' last dimension is 4
+ assert (bboxes1.size(-1) == 4 or bboxes1.size(0) == 0)
+ assert (bboxes2.size(-1) == 4 or bboxes2.size(0) == 0)
+ assert offset == 1 or offset == 0
+
+ rows = bboxes1.size(0)
+ cols = bboxes2.size(0)
+ if aligned:
+ assert rows == cols
+
+ if rows * cols == 0:
+ return bboxes1.new(rows, 1) if aligned else bboxes1.new(rows, cols)
+
+ if bboxes1.device.type == 'cpu':
+ return _bbox_overlaps_cpu(
+ bboxes1, bboxes2, mode=mode, aligned=aligned, offset=offset)
+ else:
+ if aligned:
+ ious = bboxes1.new_zeros(rows)
+ else:
+ ious = bboxes1.new_zeros((rows, cols))
+ ext_module.bbox_overlaps(
+ bboxes1,
+ bboxes2,
+ ious,
+ mode=mode_flag,
+ aligned=aligned,
+ offset=offset)
+ return ious
diff --git a/mmcv/mmcv/ops/border_align.py b/mmcv/mmcv/ops/border_align.py
new file mode 100644
index 0000000000000000000000000000000000000000..c09501b962cfce10b1da87e6b651d61911eb8406
--- /dev/null
+++ b/mmcv/mmcv/ops/border_align.py
@@ -0,0 +1,114 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# modified from
+# https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/border_align.py
+
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext', ['border_align_forward', 'border_align_backward'])
+
+
+class BorderAlignFunction(Function):
+
+ @staticmethod
+ def symbolic(g, input, boxes, pool_size):
+ return g.op(
+ 'mmcv::MMCVBorderAlign', input, boxes, pool_size_i=pool_size)
+
+ @staticmethod
+ def forward(ctx, input: torch.Tensor, boxes: torch.Tensor,
+ pool_size: int) -> torch.Tensor:
+ ctx.pool_size = pool_size
+ ctx.input_shape = input.size()
+
+ assert boxes.ndim == 3, 'boxes must be with shape [B, H*W, 4]'
+ assert boxes.size(2) == 4, \
+ 'the last dimension of boxes must be (x1, y1, x2, y2)'
+ assert input.size(1) % 4 == 0, \
+ 'the channel for input feature must be divisible by factor 4'
+
+ # [B, C//4, H*W, 4]
+ output_shape = (input.size(0), input.size(1) // 4, boxes.size(1), 4)
+ output = input.new_zeros(output_shape)
+ # `argmax_idx` only used for backward
+ argmax_idx = input.new_zeros(output_shape).to(torch.int)
+
+ ext_module.border_align_forward(
+ input, boxes, output, argmax_idx, pool_size=ctx.pool_size)
+
+ ctx.save_for_backward(boxes, argmax_idx)
+ return output
+
+ @staticmethod
+ @once_differentiable
+ def backward(ctx,
+ grad_output: torch.Tensor) -> Tuple[torch.Tensor, None, None]:
+ boxes, argmax_idx = ctx.saved_tensors
+ grad_input = grad_output.new_zeros(ctx.input_shape)
+ # a complex head architecture may leave grad_output non-contiguous
+ grad_output = grad_output.contiguous()
+ ext_module.border_align_backward(
+ grad_output,
+ boxes,
+ argmax_idx,
+ grad_input,
+ pool_size=ctx.pool_size)
+ return grad_input, None, None
+
+
+border_align = BorderAlignFunction.apply
+
+
+class BorderAlign(nn.Module):
+ r"""Border align pooling layer.
+
+ Applies border_align over the input feature based on predicted bboxes.
+ The details are described in the paper
+ `BorderDet: Border Feature for Dense Object Detection`_.
+
+ For each border line (e.g. top, left, bottom or right) of each box,
+ border_align does the following:
+
+ 1. uniformly samples ``pool_size`` + 1 positions on this line, including
+ the start and end points.
+ 2. the corresponding features at these positions are computed by bilinear
+ interpolation.
+ 3. max pooling over all ``pool_size`` + 1 positions is used to compute
+ the pooled feature.
+
+ Args:
+ pool_size (int): number of positions sampled over the boxes' borders
+ (e.g. top, bottom, left, right).
+ """
+
+ def __init__(self, pool_size: int):
+ super().__init__()
+ self.pool_size = pool_size
+
+ def forward(self, input: torch.Tensor,
+ boxes: torch.Tensor) -> torch.Tensor:
+ """
+ Args:
+ input: Features with shape [N,4C,H,W]. Channels ranged in [0,C),
+ [C,2C), [2C,3C), [3C,4C) represent the top, left, bottom,
+ right features respectively.
+ boxes: Boxes with shape [N,H*W,4]. Coordinate format (x1,y1,x2,y2).
+
+ Returns:
+ torch.Tensor: Pooled features with shape [N,C,H*W,4]. The order is
+ (top,left,bottom,right) for the last dimension.
+ """
+ return border_align(input, boxes, self.pool_size)
+
+ def __repr__(self):
+ s = self.__class__.__name__
+ s += f'(pool_size={self.pool_size})'
+ return s
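+
+
+# A minimal usage sketch (comment only; the boxes below are arbitrary
+# placeholders). Shapes follow the class docstring and the op requires the
+# compiled `_ext` CUDA extension:
+#
+#   feats = torch.rand(1, 4 * 16, 10, 10, device='cuda')     # [N, 4C, H, W]
+#   boxes = torch.rand(1, 10 * 10, 4, device='cuda') * 10    # [N, H*W, 4]
+#   pooled = BorderAlign(pool_size=10)(feats, boxes)         # [N, C, H*W, 4]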
diff --git a/mmcv/mmcv/ops/box_iou_rotated.py b/mmcv/mmcv/ops/box_iou_rotated.py
new file mode 100644
index 0000000000000000000000000000000000000000..2443af27c92146ed4328e8f94b1415c7e72c542b
--- /dev/null
+++ b/mmcv/mmcv/ops/box_iou_rotated.py
@@ -0,0 +1,148 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['box_iou_rotated'])
+
+
+def box_iou_rotated(bboxes1: torch.Tensor,
+ bboxes2: torch.Tensor,
+ mode: str = 'iou',
+ aligned: bool = False,
+ clockwise: bool = True) -> torch.Tensor:
+ """Return intersection-over-union (Jaccard index) of boxes.
+
+ Both sets of boxes are expected to be in
+ (x_center, y_center, width, height, angle) format.
+
+ If ``aligned`` is ``False``, then calculate the ious between each bbox
+ of bboxes1 and bboxes2, otherwise the ious between each aligned pair of
+ bboxes1 and bboxes2.
+
+ .. note::
+ The operator assumes:
+
+ 1) The positive direction along x axis is left -> right.
+
+ 2) The positive direction along y axis is top -> down.
+
+ 3) The w border is in parallel with x axis when angle = 0.
+
+ However, there are 2 opposite definitions of the positive angular
+ direction, clockwise (CW) and counter-clockwise (CCW). MMCV supports
+ both definitions and uses CW by default.
+
+ Please set ``clockwise=False`` if you are using the CCW definition.
+
+ The coordinate system when ``clockwise`` is ``True`` (default)
+
+ .. code-block:: none
+
+ 0-------------------> x (0 rad)
+ | A-------------B
+ | | |
+ | | box h
+ | | angle=0 |
+ | D------w------C
+ v
+ y (pi/2 rad)
+
+ In such a coordinate system, the rotation matrix is
+
+ .. math::
+ \\begin{pmatrix}
+ \\cos\\alpha & -\\sin\\alpha \\\\
+ \\sin\\alpha & \\cos\\alpha
+ \\end{pmatrix}
+
+ The coordinates of the corner point A can be calculated as:
+
+ .. math::
+ P_A=
+ \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
+ =
+ \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
+ \\begin{pmatrix}\\cos\\alpha & -\\sin\\alpha \\\\
+ \\sin\\alpha & \\cos\\alpha\\end{pmatrix}
+ \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
+ =
+ \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha+0.5h\\sin\\alpha
+ \\\\
+ y_{center}-0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
+
+
+ The coordinate system when ``clockwise`` is ``False``
+
+ .. code-block:: none
+
+ 0-------------------> x (0 rad)
+ | A-------------B
+ | | |
+ | | box h
+ | | angle=0 |
+ | D------w------C
+ v
+ y (-pi/2 rad)
+
+ In such a coordinate system, the rotation matrix is
+
+ .. math::
+ \\begin{pmatrix}
+ \\cos\\alpha & \\sin\\alpha \\\\
+ -\\sin\\alpha & \\cos\\alpha
+ \\end{pmatrix}
+
+ The coordinates of the corner point A can be calculated as:
+
+ .. math::
+ P_A=
+ \\begin{pmatrix} x_A \\\\ y_A\\end{pmatrix}
+ =
+ \\begin{pmatrix} x_{center} \\\\ y_{center}\\end{pmatrix} +
+ \\begin{pmatrix}\\cos\\alpha & \\sin\\alpha \\\\
+ -\\sin\\alpha & \\cos\\alpha\\end{pmatrix}
+ \\begin{pmatrix} -0.5w \\\\ -0.5h\\end{pmatrix} \\\\
+ =
+ \\begin{pmatrix} x_{center}-0.5w\\cos\\alpha-0.5h\\sin\\alpha
+ \\\\
+ y_{center}+0.5w\\sin\\alpha-0.5h\\cos\\alpha\\end{pmatrix}
+
+ Args:
+ bboxes1 (torch.Tensor): rotated bboxes 1. It has shape (N, 5),
+ indicating (x, y, w, h, theta) for each row. Note that theta is in
+ radians.
+ bboxes2 (torch.Tensor): rotated bboxes 2. It has shape (M, 5),
+ indicating (x, y, w, h, theta) for each row. Note that theta is in
+ radians.
+ mode (str): "iou" (intersection over union) or "iof" (intersection
+ over foreground).
+ clockwise (bool): flag indicating whether the positive angular
+ orientation is clockwise. Defaults to True.
+ `New in version 1.4.3.`
+
+ Returns:
+ torch.Tensor: The IoUs between the boxes. If ``aligned`` is
+ ``False``, the shape of ious is (N, M), else (N,).
+ """
+ assert mode in ['iou', 'iof']
+ mode_dict = {'iou': 0, 'iof': 1}
+ mode_flag = mode_dict[mode]
+ rows = bboxes1.size(0)
+ cols = bboxes2.size(0)
+ if aligned:
+ ious = bboxes1.new_zeros(rows)
+ else:
+ ious = bboxes1.new_zeros(rows * cols)
+ if not clockwise:
+ flip_mat = bboxes1.new_ones(bboxes1.shape[-1])
+ flip_mat[-1] = -1
+ bboxes1 = bboxes1 * flip_mat
+ bboxes2 = bboxes2 * flip_mat
+ bboxes1 = bboxes1.contiguous()
+ bboxes2 = bboxes2.contiguous()
+ ext_module.box_iou_rotated(
+ bboxes1, bboxes2, ious, mode_flag=mode_flag, aligned=aligned)
+ if not aligned:
+ ious = ious.view(rows, cols)
+ return ious
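+
+
+# A minimal usage sketch (comment only; requires the compiled `_ext`
+# extension). Boxes follow the (x_center, y_center, w, h, theta-in-radians)
+# convention described above:
+#
+#   boxes1 = torch.tensor([[10., 10., 4., 2., 0.0]])
+#   boxes2 = torch.tensor([[10., 10., 4., 2., 0.5]])
+#   ious = box_iou_rotated(boxes1, boxes2)                        # shape (1, 1)
+#   ious_aligned = box_iou_rotated(boxes1, boxes2, aligned=True)  # shape (1,)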
diff --git a/mmcv/mmcv/ops/carafe.py b/mmcv/mmcv/ops/carafe.py
new file mode 100644
index 0000000000000000000000000000000000000000..18230c08074f5309e791810a4774e294084c3f5b
--- /dev/null
+++ b/mmcv/mmcv/ops/carafe.py
@@ -0,0 +1,301 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+from torch.autograd import Function
+from torch.nn.modules.module import Module
+
+from ..cnn import UPSAMPLE_LAYERS, normal_init, xavier_init
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', [
+ 'carafe_naive_forward', 'carafe_naive_backward', 'carafe_forward',
+ 'carafe_backward'
+])
+
+
+class CARAFENaiveFunction(Function):
+
+ @staticmethod
+ def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int,
+ group_size: int, scale_factor: int) -> Tensor:
+ return g.op(
+ 'mmcv::MMCVCARAFENaive',
+ features,
+ masks,
+ kernel_size_i=kernel_size,
+ group_size_i=group_size,
+ scale_factor_f=scale_factor)
+
+ @staticmethod
+ def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int,
+ group_size: int, scale_factor: int) -> Tensor:
+ assert scale_factor >= 1
+ assert masks.size(1) == kernel_size * kernel_size * group_size
+ assert masks.size(-1) == features.size(-1) * scale_factor
+ assert masks.size(-2) == features.size(-2) * scale_factor
+ assert features.size(1) % group_size == 0
+ assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
+ ctx.kernel_size = kernel_size
+ ctx.group_size = group_size
+ ctx.scale_factor = scale_factor
+ ctx.feature_size = features.size()
+ ctx.mask_size = masks.size()
+
+ n, c, h, w = features.size()
+ output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))
+ ext_module.carafe_naive_forward(
+ features,
+ masks,
+ output,
+ kernel_size=kernel_size,
+ group_size=group_size,
+ scale_factor=scale_factor)
+
+ if features.requires_grad or masks.requires_grad or \
+ torch.__version__ == 'parrots':
+ ctx.save_for_backward(features, masks)
+ return output
+
+ @staticmethod
+ def backward(
+ ctx,
+ grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
+ assert grad_output.is_cuda
+
+ features, masks = ctx.saved_tensors
+ kernel_size = ctx.kernel_size
+ group_size = ctx.group_size
+ scale_factor = ctx.scale_factor
+
+ grad_input = torch.zeros_like(features)
+ grad_masks = torch.zeros_like(masks)
+ ext_module.carafe_naive_backward(
+ grad_output.contiguous(),
+ features,
+ masks,
+ grad_input,
+ grad_masks,
+ kernel_size=kernel_size,
+ group_size=group_size,
+ scale_factor=scale_factor)
+
+ return grad_input, grad_masks, None, None, None
+
+
+carafe_naive = CARAFENaiveFunction.apply
+
+
+class CARAFENaive(Module):
+
+ def __init__(self, kernel_size: int, group_size: int, scale_factor: int):
+ super().__init__()
+
+ assert isinstance(kernel_size, int) and isinstance(
+ group_size, int) and isinstance(scale_factor, int)
+ self.kernel_size = kernel_size
+ self.group_size = group_size
+ self.scale_factor = scale_factor
+
+ def forward(self, features: Tensor, masks: Tensor) -> Tensor:
+ return carafe_naive(features, masks, self.kernel_size, self.group_size,
+ self.scale_factor)
+
+
+class CARAFEFunction(Function):
+
+ @staticmethod
+ def symbolic(g, features: Tensor, masks: Tensor, kernel_size: int,
+ group_size: int, scale_factor: int) -> Tensor:
+ return g.op(
+ 'mmcv::MMCVCARAFE',
+ features,
+ masks,
+ kernel_size_i=kernel_size,
+ group_size_i=group_size,
+ scale_factor_f=scale_factor)
+
+ @staticmethod
+ def forward(ctx, features: Tensor, masks: Tensor, kernel_size: int,
+ group_size: int, scale_factor: int) -> Tensor:
+ assert scale_factor >= 1
+ assert masks.size(1) == kernel_size * kernel_size * group_size
+ assert masks.size(-1) == features.size(-1) * scale_factor
+ assert masks.size(-2) == features.size(-2) * scale_factor
+ assert features.size(1) % group_size == 0
+ assert (kernel_size - 1) % 2 == 0 and kernel_size >= 1
+ ctx.kernel_size = kernel_size
+ ctx.group_size = group_size
+ ctx.scale_factor = scale_factor
+ ctx.feature_size = features.size()
+ ctx.mask_size = masks.size()
+
+ n, c, h, w = features.size()
+ output = features.new_zeros((n, c, h * scale_factor, w * scale_factor))
+ routput = features.new_zeros(output.size(), requires_grad=False)
+ rfeatures = features.new_zeros(features.size(), requires_grad=False)
+ rmasks = masks.new_zeros(masks.size(), requires_grad=False)
+ ext_module.carafe_forward(
+ features,
+ masks,
+ rfeatures,
+ routput,
+ rmasks,
+ output,
+ kernel_size=kernel_size,
+ group_size=group_size,
+ scale_factor=scale_factor)
+
+ if features.requires_grad or masks.requires_grad or \
+ torch.__version__ == 'parrots':
+ ctx.save_for_backward(features, masks, rfeatures)
+ return output
+
+ @staticmethod
+ def backward(
+ ctx,
+ grad_output: Tensor) -> Tuple[Tensor, Tensor, None, None, None]:
+ assert grad_output.is_cuda
+
+ features, masks, rfeatures = ctx.saved_tensors
+ kernel_size = ctx.kernel_size
+ group_size = ctx.group_size
+ scale_factor = ctx.scale_factor
+
+ rgrad_output = torch.zeros_like(grad_output, requires_grad=False)
+ rgrad_input_hs = torch.zeros_like(grad_output, requires_grad=False)
+ rgrad_input = torch.zeros_like(features, requires_grad=False)
+ rgrad_masks = torch.zeros_like(masks, requires_grad=False)
+ grad_input = torch.zeros_like(features, requires_grad=False)
+ grad_masks = torch.zeros_like(masks, requires_grad=False)
+ ext_module.carafe_backward(
+ grad_output.contiguous(),
+ rfeatures,
+ masks,
+ rgrad_output,
+ rgrad_input_hs,
+ rgrad_input,
+ rgrad_masks,
+ grad_input,
+ grad_masks,
+ kernel_size=kernel_size,
+ group_size=group_size,
+ scale_factor=scale_factor)
+ return grad_input, grad_masks, None, None, None
+
+
+carafe = CARAFEFunction.apply
+
+
+class CARAFE(Module):
+ """ CARAFE: Content-Aware ReAssembly of FEatures
+
+ Please refer to `CARAFE: Content-Aware ReAssembly of FEatures`_ for
+ more details.
+
+ Args:
+ kernel_size (int): reassemble kernel size
+ group_size (int): reassemble group size
+ scale_factor (int): upsample ratio
+
+ Returns:
+ upsampled feature map
+ """
+
+ def __init__(self, kernel_size: int, group_size: int, scale_factor: int):
+ super().__init__()
+
+ assert isinstance(kernel_size, int) and isinstance(
+ group_size, int) and isinstance(scale_factor, int)
+ self.kernel_size = kernel_size
+ self.group_size = group_size
+ self.scale_factor = scale_factor
+
+ def forward(self, features: Tensor, masks: Tensor) -> Tensor:
+ return carafe(features, masks, self.kernel_size, self.group_size,
+ self.scale_factor)
+
+
+@UPSAMPLE_LAYERS.register_module(name='carafe')
+class CARAFEPack(nn.Module):
+ """A unified package of CARAFE upsampler that contains: 1) channel
+ compressor 2) content encoder 3) CARAFE op.
+
+ Official implementation of ICCV 2019 paper
+ `CARAFE: Content-Aware ReAssembly of FEatures`_.
+
+ Args:
+ channels (int): input feature channels
+ scale_factor (int): upsample ratio
+ up_kernel (int): kernel size of CARAFE op
+ up_group (int): group size of CARAFE op
+ encoder_kernel (int): kernel size of content encoder
+ encoder_dilation (int): dilation of content encoder
+ compressed_channels (int): output channels of channels compressor
+
+ Returns:
+ upsampled feature map
+ """
+
+ def __init__(self,
+ channels: int,
+ scale_factor: int,
+ up_kernel: int = 5,
+ up_group: int = 1,
+ encoder_kernel: int = 3,
+ encoder_dilation: int = 1,
+ compressed_channels: int = 64):
+ super().__init__()
+ self.channels = channels
+ self.scale_factor = scale_factor
+ self.up_kernel = up_kernel
+ self.up_group = up_group
+ self.encoder_kernel = encoder_kernel
+ self.encoder_dilation = encoder_dilation
+ self.compressed_channels = compressed_channels
+ self.channel_compressor = nn.Conv2d(channels, self.compressed_channels,
+ 1)
+ self.content_encoder = nn.Conv2d(
+ self.compressed_channels,
+ self.up_kernel * self.up_kernel * self.up_group *
+ self.scale_factor * self.scale_factor,
+ self.encoder_kernel,
+ padding=int((self.encoder_kernel - 1) * self.encoder_dilation / 2),
+ dilation=self.encoder_dilation,
+ groups=1)
+ self.init_weights()
+
+ def init_weights(self):
+ for m in self.modules():
+ if isinstance(m, nn.Conv2d):
+ xavier_init(m, distribution='uniform')
+ normal_init(self.content_encoder, std=0.001)
+
+ def kernel_normalizer(self, mask: Tensor) -> Tensor:
+ mask = F.pixel_shuffle(mask, self.scale_factor)
+ n, mask_c, h, w = mask.size()
+ # use float division explicitly,
+ # to avoid inconsistency while exporting to onnx
+ mask_channel = int(mask_c / float(self.up_kernel**2))
+ mask = mask.view(n, mask_channel, -1, h, w)
+
+ mask = F.softmax(mask, dim=2, dtype=mask.dtype)
+ mask = mask.view(n, mask_c, h, w).contiguous()
+
+ return mask
+
+ def feature_reassemble(self, x: Tensor, mask: Tensor) -> Tensor:
+ x = carafe(x, mask, self.up_kernel, self.up_group, self.scale_factor)
+ return x
+
+ def forward(self, x: Tensor) -> Tensor:
+ compressed_x = self.channel_compressor(x)
+ mask = self.content_encoder(compressed_x)
+ mask = self.kernel_normalizer(mask)
+
+ x = self.feature_reassemble(x, mask)
+ return x
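+
+
+# A minimal usage sketch (comment only). CARAFEPack predicts its own
+# reassembly kernels, so only a feature map is needed; the underlying carafe
+# op requires the compiled `_ext` CUDA extension:
+#
+#   up = CARAFEPack(channels=256, scale_factor=2).cuda()
+#   x = torch.rand(1, 256, 32, 32, device='cuda')
+#   y = up(x)                                   # (1, 256, 64, 64)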
diff --git a/mmcv/mmcv/ops/cc_attention.py b/mmcv/mmcv/ops/cc_attention.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e5d3325263f18f6b5eb0bfbc522eeaef1999e3b
--- /dev/null
+++ b/mmcv/mmcv/ops/cc_attention.py
@@ -0,0 +1,84 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmcv.cnn import PLUGIN_LAYERS, Scale
+
+
+def NEG_INF_DIAG(n: int, device: torch.device) -> torch.Tensor:
+ """Returns a diagonal matrix of size [n, n].
+
+ The diagonal entries are all "-inf". This avoids counting the
+ overlapping element in Criss-Cross attention twice.
+ """
+ return torch.diag(torch.tensor(float('-inf')).to(device).repeat(n), 0)
+
+
+@PLUGIN_LAYERS.register_module()
+class CrissCrossAttention(nn.Module):
+ """Criss-Cross Attention Module.
+
+ .. note::
+ Before v1.3.13, we use a CUDA op. Since v1.3.13, we switch
+ to a pure PyTorch and equivalent implementation. For more
+ details, please refer to https://github.com/open-mmlab/mmcv/pull/1201.
+
+ Speed comparison for one forward pass
+
+ - Input size: [2,512,97,97]
+ - Device: 1 NVIDIA GeForce RTX 2080 Ti
+
+ +-----------------------+---------------+------------+---------------+
+ | |PyTorch version|CUDA version|Relative speed |
+ +=======================+===============+============+===============+
+ |with torch.no_grad() |0.00554402 s |0.0299619 s |5.4x |
+ +-----------------------+---------------+------------+---------------+
+ |no with torch.no_grad()|0.00562803 s |0.0301349 s |5.4x |
+ +-----------------------+---------------+------------+---------------+
+
+ Args:
+ in_channels (int): Channels of the input feature map.
+ """
+
+ def __init__(self, in_channels: int) -> None:
+ super().__init__()
+ self.query_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
+ self.key_conv = nn.Conv2d(in_channels, in_channels // 8, 1)
+ self.value_conv = nn.Conv2d(in_channels, in_channels, 1)
+ self.gamma = Scale(0.)
+ self.in_channels = in_channels
+
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
+ """forward function of Criss-Cross Attention.
+
+ Args:
+ x (torch.Tensor): Input feature with the shape of
+ (batch_size, in_channels, height, width).
+
+ Returns:
+ torch.Tensor: Output of the layer, with the shape of
+ (batch_size, in_channels, height, width)
+ """
+ B, C, H, W = x.size()
+ query = self.query_conv(x)
+ key = self.key_conv(x)
+ value = self.value_conv(x)
+ energy_H = torch.einsum('bchw,bciw->bwhi', query, key) + NEG_INF_DIAG(
+ H, query.device)
+ energy_H = energy_H.transpose(1, 2)
+ energy_W = torch.einsum('bchw,bchj->bhwj', query, key)
+ attn = F.softmax(
+ torch.cat([energy_H, energy_W], dim=-1), dim=-1) # [B,H,W,(H+W)]
+ out = torch.einsum('bciw,bhwi->bchw', value, attn[..., :H])
+ out += torch.einsum('bchj,bhwj->bchw', value, attn[..., H:])
+
+ out = self.gamma(out) + x
+ out = out.contiguous()
+
+ return out
+
+ def __repr__(self) -> str:
+ s = self.__class__.__name__
+ s += f'(in_channels={self.in_channels})'
+ return s
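+
+
+# A minimal usage sketch (comment only); this module is pure PyTorch, so it
+# also runs on CPU:
+#
+#   cca = CrissCrossAttention(in_channels=64)
+#   x = torch.rand(2, 64, 16, 16)
+#   y = cca(x)                                  # (2, 64, 16, 16)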
diff --git a/mmcv/mmcv/ops/chamfer_distance.py b/mmcv/mmcv/ops/chamfer_distance.py
new file mode 100644
index 0000000000000000000000000000000000000000..d68eafb47c85418c374a1eaf086478e3fc0cb1d1
--- /dev/null
+++ b/mmcv/mmcv/ops/chamfer_distance.py
@@ -0,0 +1,95 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Sequence, Tuple
+
+import torch
+from torch import Tensor
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext', ['chamfer_distance_forward', 'chamfer_distance_backward'])
+
+
+class ChamferDistanceFunction(Function):
+ """This is an implementation of the 2D Chamfer Distance.
+
+ It has been used in the paper `Oriented RepPoints for Aerial Object
+ Detection (CVPR 2022)`_.
+ """
+
+ @staticmethod
+ def forward(ctx, xyz1: Tensor, xyz2: Tensor) -> Sequence[Tensor]:
+ """
+ Args:
+ xyz1 (Tensor): Point set with shape (B, N, 2).
+ xyz2 (Tensor): Point set with shape (B, N, 2).
+
+ Returns:
+ Sequence[Tensor]:
+
+ - dist1 (Tensor): Chamfer distance (xyz1 to xyz2) with
+ shape (B, N).
+ - dist2 (Tensor): Chamfer distance (xyz2 to xyz1) with
+ shape (B, N).
+ - idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2)
+ with shape (B, N), which is used to compute the gradient.
+ - idx2 (Tensor): Index of chamfer distance (xyz2 to xyz1)
+ with shape (B, N), which is used to compute the gradient.
+ """
+ batch_size, n, _ = xyz1.size()
+ _, m, _ = xyz2.size()
+ device = xyz1.device
+ xyz1 = xyz1.contiguous()
+ xyz2 = xyz2.contiguous()
+
+ dist1 = torch.zeros(batch_size, n).to(device)
+ dist2 = torch.zeros(batch_size, m).to(device)
+ idx1 = torch.zeros(batch_size, n).type(torch.IntTensor).to(device)
+ idx2 = torch.zeros(batch_size, m).type(torch.IntTensor).to(device)
+
+ ext_module.chamfer_distance_forward(xyz1, xyz2, dist1, dist2, idx1,
+ idx2)
+ ctx.save_for_backward(xyz1, xyz2, idx1, idx2)
+ return dist1, dist2, idx1, idx2
+
+ @staticmethod
+ @once_differentiable
+ def backward(ctx, grad_dist1: Tensor, grad_dist2: Tensor,
+ grad_idx1: Tensor,
+ grad_idx2: Tensor) -> Tuple[Tensor, Tensor]:
+ """
+
+ Args:
+ grad_dist1 (Tensor): Gradient of chamfer distance
+ (xyz1 to xyz2) with shape (B, N).
+ grad_dist2 (Tensor): Gradient of chamfer distance
+ (xyz2 to xyz1) with shape (B, N).
+ grad_idx1 (Tensor): Index of chamfer distance (xyz1 to xyz2)
+ with shape (B, N), which is used to compute the gradient.
+ grad_idx2 (Tensor): Index of chamfer distance (xyz2 to xyz1)
+ with shape (B, N), which is used to compute the gradient.
+
+ Returns:
+ Tuple[Tensor, Tensor]:
+
+ - grad_xyz1 (Tensor): Gradient of the point set with shape \
+ (B, N, 2).
+ - grad_xyz2 (Tensor): Gradient of the point set with shape \
+ (B, N, 2).
+ """
+ xyz1, xyz2, idx1, idx2 = ctx.saved_tensors
+ device = grad_dist1.device
+ grad_dist1 = grad_dist1.contiguous()
+ grad_dist2 = grad_dist2.contiguous()
+ grad_xyz1 = torch.zeros(xyz1.size()).to(device)
+ grad_xyz2 = torch.zeros(xyz2.size()).to(device)
+
+ ext_module.chamfer_distance_backward(xyz1, xyz2, grad_xyz1, grad_xyz2,
+ grad_dist1, grad_dist2, idx1,
+ idx2)
+ return grad_xyz1, grad_xyz2
+
+
+chamfer_distance = ChamferDistanceFunction.apply
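+
+
+# Illustrative usage sketch (not part of the original file): `chamfer_distance`
+# expects two batched 2D point sets and returns per-point distances plus the
+# matched indices. The compiled `_ext` op is assumed to run on CUDA tensors.
+if __name__ == '__main__':
+    if torch.cuda.is_available():
+        xyz1 = torch.rand(4, 100, 2, device='cuda')
+        xyz2 = torch.rand(4, 80, 2, device='cuda')
+        dist1, dist2, idx1, idx2 = chamfer_distance(xyz1, xyz2)
+        # dist1: (4, 100), dist2: (4, 80); a symmetric loss averages both.
+        loss = dist1.mean() + dist2.mean()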
diff --git a/mmcv/mmcv/ops/contour_expand.py b/mmcv/mmcv/ops/contour_expand.py
new file mode 100644
index 0000000000000000000000000000000000000000..7184609ad9b64d421c17fdfe4a1a0dbeb62d64c8
--- /dev/null
+++ b/mmcv/mmcv/ops/contour_expand.py
@@ -0,0 +1,52 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Union
+
+import numpy as np
+import torch
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['contour_expand'])
+
+
+def contour_expand(kernel_mask: Union[np.ndarray, torch.Tensor],
+                   internal_kernel_label: Union[np.ndarray, torch.Tensor],
+ min_kernel_area: int, kernel_num: int) -> list:
+ """Expand kernel contours so that foreground pixels are assigned into
+ instances.
+
+ Args:
+        kernel_mask (np.ndarray or torch.Tensor): The instance kernel mask
+            with size hxw.
+        internal_kernel_label (np.ndarray or torch.Tensor): The instance
+            internal kernel label with size hxw.
+ min_kernel_area (int): The minimum kernel area.
+ kernel_num (int): The instance kernel number.
+
+ Returns:
+ list: The instance index map with size hxw.
+ """
+ assert isinstance(kernel_mask, (torch.Tensor, np.ndarray))
+ assert isinstance(internal_kernel_label, (torch.Tensor, np.ndarray))
+ assert isinstance(min_kernel_area, int)
+ assert isinstance(kernel_num, int)
+
+ if isinstance(kernel_mask, np.ndarray):
+ kernel_mask = torch.from_numpy(kernel_mask)
+ if isinstance(internal_kernel_label, np.ndarray):
+ internal_kernel_label = torch.from_numpy(internal_kernel_label)
+
+ if torch.__version__ == 'parrots':
+ if kernel_mask.shape[0] == 0 or internal_kernel_label.shape[0] == 0:
+ label = []
+ else:
+ label = ext_module.contour_expand(
+ kernel_mask,
+ internal_kernel_label,
+ min_kernel_area=min_kernel_area,
+ kernel_num=kernel_num)
+ label = label.tolist() # type: ignore
+ else:
+ label = ext_module.contour_expand(kernel_mask, internal_kernel_label,
+ min_kernel_area, kernel_num)
+ return label
diff --git a/mmcv/mmcv/ops/convex_iou.py b/mmcv/mmcv/ops/convex_iou.py
new file mode 100644
index 0000000000000000000000000000000000000000..50050363ac5b08cfa8f86dd186ab7087fac6f48a
--- /dev/null
+++ b/mmcv/mmcv/ops/convex_iou.py
@@ -0,0 +1,52 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext('_ext', ['convex_iou', 'convex_giou'])
+
+
+def convex_giou(pointsets: torch.Tensor,
+ polygons: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Return generalized intersection-over-union (Jaccard index) between point
+ sets and polygons.
+
+ Args:
+ pointsets (torch.Tensor): It has shape (N, 18),
+ indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
+ polygons (torch.Tensor): It has shape (N, 8),
+ indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.
+
+ Returns:
+ tuple[torch.Tensor, torch.Tensor]: The first element is the gious
+ between point sets and polygons with the shape (N,). The second
+ element is the gradient of point sets with the shape (N, 18).
+ """
+ output = pointsets.new_zeros((pointsets.size(0), 19))
+ ext_module.convex_giou(pointsets, polygons, output)
+ convex_giou = output[:, -1]
+ points_grad = output[:, 0:-1]
+ return convex_giou, points_grad
+
+
+def convex_iou(pointsets: torch.Tensor,
+ polygons: torch.Tensor) -> torch.Tensor:
+ """Return intersection-over-union (Jaccard index) between point sets and
+ polygons.
+
+ Args:
+ pointsets (torch.Tensor): It has shape (N, 18),
+ indicating (x1, y1, x2, y2, ..., x9, y9) for each row.
+ polygons (torch.Tensor): It has shape (K, 8),
+ indicating (x1, y1, x2, y2, x3, y3, x4, y4) for each row.
+
+ Returns:
+ torch.Tensor: Return the ious between point sets and polygons with the
+ shape (N, K).
+ """
+ N, K = pointsets.size(0), polygons.size(0)
+ ious = pointsets.new_zeros((N, K))
+ ext_module.convex_iou(pointsets, polygons, ious)
+ return ious
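+
+
+# Illustrative usage sketch (not part of the original file): both ops dispatch
+# to the compiled `_ext` extension, so CUDA tensors are assumed here.
+if __name__ == '__main__':
+    if torch.cuda.is_available():
+        pointsets = torch.rand(16, 18, device='cuda')  # 9 (x, y) points per row
+        polygons = torch.rand(32, 8, device='cuda')  # 4 corners per row
+        ious = convex_iou(pointsets, polygons)  # shape (16, 32)
+        gious, grad = convex_giou(pointsets, polygons[:16])  # (16,), (16, 18)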
diff --git a/mmcv/mmcv/ops/corner_pool.py b/mmcv/mmcv/ops/corner_pool.py
new file mode 100644
index 0000000000000000000000000000000000000000..17ce24952a3b229fb552f450429c948e70aefa19
--- /dev/null
+++ b/mmcv/mmcv/ops/corner_pool.py
@@ -0,0 +1,156 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from torch import Tensor, nn
+from torch.autograd import Function
+
+_mode_dict = {'top': 0, 'bottom': 1, 'left': 2, 'right': 3}
+
+
+def _corner_pool(x: Tensor, dim: int, flip: bool) -> Tensor:
+ size = x.size(dim)
+ output = x.clone()
+
+ ind = 1
+ while ind < size:
+ if flip:
+ cur_start = 0
+ cur_len = size - ind
+ next_start = ind
+ next_len = size - ind
+ else:
+ cur_start = ind
+ cur_len = size - ind
+ next_start = 0
+ next_len = size - ind
+
+ # max_temp should be cloned for backward computation
+ max_temp = output.narrow(dim, cur_start, cur_len).clone()
+ cur_temp = output.narrow(dim, cur_start, cur_len)
+ next_temp = output.narrow(dim, next_start, next_len)
+
+ cur_temp[...] = torch.where(max_temp > next_temp, max_temp, next_temp)
+
+ ind = ind << 1
+
+ return output
+
+
+class TopPoolFunction(Function):
+
+ @staticmethod
+ def symbolic(g, input: Tensor) -> Tensor:
+ output = g.op(
+ 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['top']))
+ return output
+
+ @staticmethod
+ def forward(ctx, input: Tensor) -> Tensor:
+ return _corner_pool(input, 2, True)
+
+
+class BottomPoolFunction(Function):
+
+ @staticmethod
+ def symbolic(g, input: Tensor) -> Tensor:
+ output = g.op(
+ 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['bottom']))
+ return output
+
+ @staticmethod
+ def forward(ctx, input: Tensor) -> Tensor:
+ return _corner_pool(input, 2, False)
+
+
+class LeftPoolFunction(Function):
+
+ @staticmethod
+ def symbolic(g, input: Tensor) -> Tensor:
+ output = g.op(
+ 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['left']))
+ return output
+
+ @staticmethod
+ def forward(ctx, input: Tensor) -> Tensor:
+ return _corner_pool(input, 3, True)
+
+
+class RightPoolFunction(Function):
+
+ @staticmethod
+ def symbolic(g, input: Tensor) -> Tensor:
+ output = g.op(
+ 'mmcv::MMCVCornerPool', input, mode_i=int(_mode_dict['right']))
+ return output
+
+ @staticmethod
+ def forward(ctx, input: Tensor) -> Tensor:
+ return _corner_pool(input, 3, False)
+
+
+class CornerPool(nn.Module):
+ """Corner Pooling.
+
+ Corner Pooling is a new type of pooling layer that helps a
+ convolutional network better localize corners of bounding boxes.
+
+    Please refer to `CornerNet: Detecting Objects as Paired Keypoints
+    <https://arxiv.org/abs/1808.01244>`_ for more details.
+
+ Code is modified from https://github.com/princeton-vl/CornerNet-Lite.
+
+ Args:
+ mode (str): Pooling orientation for the pooling layer
+
+ - 'bottom': Bottom Pooling
+ - 'left': Left Pooling
+ - 'right': Right Pooling
+ - 'top': Top Pooling
+
+ Returns:
+ Feature map after pooling.
+ """
+
+ pool_functions = {
+ 'bottom': BottomPoolFunction,
+ 'left': LeftPoolFunction,
+ 'right': RightPoolFunction,
+ 'top': TopPoolFunction,
+ }
+
+ cummax_dim_flip = {
+ 'bottom': (2, False),
+ 'left': (3, True),
+ 'right': (3, False),
+ 'top': (2, True),
+ }
+
+ def __init__(self, mode: str):
+ super().__init__()
+ assert mode in self.pool_functions
+ self.mode = mode
+ self.corner_pool: Function = self.pool_functions[mode]
+
+ def forward(self, x: Tensor) -> Tensor:
+ if torch.__version__ != 'parrots' and torch.__version__ >= '1.5.0':
+ if torch.onnx.is_in_onnx_export():
+ assert torch.__version__ >= '1.7.0', \
+                    'When `cummax` serves as an intermediate component whose '\
+                    'output is used as input to other modules, the PyTorch '\
+                    'version must be >= 1.7.0; otherwise an error such as '\
+                    '`RuntimeError: tuple appears in op that does not '\
+                    'forward tuples, unsupported kind: prim::PythonOp` '\
+                    'is raised.'
+
+ dim, flip = self.cummax_dim_flip[self.mode]
+ if flip:
+ x = x.flip(dim)
+ pool_tensor, _ = torch.cummax(x, dim=dim)
+ if flip:
+ pool_tensor = pool_tensor.flip(dim)
+ return pool_tensor
+ else:
+ if torch.onnx.is_in_onnx_export():
+ return self.corner_pool.apply(x)
+ else:
+ dim, flip = self.cummax_dim_flip[self.mode]
+ return _corner_pool(x, dim, flip)
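+
+
+# Illustrative usage sketch (not part of the original file): CornerPool is a
+# plain nn.Module, so it can be combined like any other layer. On recent
+# PyTorch versions the forward pass falls back to `torch.cummax` and therefore
+# also runs on CPU.
+if __name__ == '__main__':
+    feat = torch.randn(2, 16, 24, 24)
+    corner_feat = CornerPool('top')(feat) + CornerPool('left')(feat)
+    assert corner_feat.shape == feat.shape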
diff --git a/mmcv/mmcv/ops/correlation.py b/mmcv/mmcv/ops/correlation.py
new file mode 100644
index 0000000000000000000000000000000000000000..319b7646782637e9ebaac4ef07b82d1f460031b5
--- /dev/null
+++ b/mmcv/mmcv/ops/correlation.py
@@ -0,0 +1,200 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import Tuple
+
+import torch
+from torch import Tensor, nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+from torch.nn.modules.utils import _pair
+
+from ..utils import ext_loader
+
+ext_module = ext_loader.load_ext(
+ '_ext', ['correlation_forward', 'correlation_backward'])
+
+
+class CorrelationFunction(Function):
+
+ @staticmethod
+ def forward(ctx,
+ input1: Tensor,
+ input2: Tensor,
+ kernel_size: int = 1,
+ max_displacement: int = 1,
+ stride: int = 1,
+ padding: int = 1,
+ dilation: int = 1,
+ dilation_patch: int = 1) -> Tensor:
+
+ ctx.save_for_backward(input1, input2)
+
+ kH, kW = ctx.kernel_size = _pair(kernel_size)
+ patch_size = max_displacement * 2 + 1
+ ctx.patch_size = patch_size
+ dH, dW = ctx.stride = _pair(stride)
+ padH, padW = ctx.padding = _pair(padding)
+ dilationH, dilationW = ctx.dilation = _pair(dilation)
+ dilation_patchH, dilation_patchW = ctx.dilation_patch = _pair(
+ dilation_patch)
+
+ output_size = CorrelationFunction._output_size(ctx, input1)
+
+ output = input1.new_zeros(output_size)
+
+ ext_module.correlation_forward(
+ input1,
+ input2,
+ output,
+ kH=kH,
+ kW=kW,
+ patchH=patch_size,
+ patchW=patch_size,
+ padH=padH,
+ padW=padW,
+ dilationH=dilationH,
+ dilationW=dilationW,
+ dilation_patchH=dilation_patchH,
+ dilation_patchW=dilation_patchW,
+ dH=dH,
+ dW=dW)
+
+ return output
+
+ @staticmethod
+ @once_differentiable
+ def backward(
+ ctx, grad_output: Tensor
+ ) -> Tuple[Tensor, Tensor, None, None, None, None, None, None]:
+ input1, input2 = ctx.saved_tensors
+
+ kH, kW = ctx.kernel_size
+ patch_size = ctx.patch_size
+ padH, padW = ctx.padding
+ dilationH, dilationW = ctx.dilation
+ dilation_patchH, dilation_patchW = ctx.dilation_patch
+ dH, dW = ctx.stride
+ grad_input1 = torch.zeros_like(input1)
+ grad_input2 = torch.zeros_like(input2)
+
+ ext_module.correlation_backward(
+ grad_output,
+ input1,
+ input2,
+ grad_input1,
+ grad_input2,
+ kH=kH,
+ kW=kW,
+ patchH=patch_size,
+ patchW=patch_size,
+ padH=padH,
+ padW=padW,
+ dilationH=dilationH,
+ dilationW=dilationW,
+ dilation_patchH=dilation_patchH,
+ dilation_patchW=dilation_patchW,
+ dH=dH,
+ dW=dW)
+ return grad_input1, grad_input2, None, None, None, None, None, None
+
+ @staticmethod
+ def _output_size(ctx, input1):
+ iH, iW = input1.size(2), input1.size(3)
+ batch_size = input1.size(0)
+ kH, kW = ctx.kernel_size
+ patch_size = ctx.patch_size
+ dH, dW = ctx.stride
+ padH, padW = ctx.padding
+ dilationH, dilationW = ctx.dilation
+ dilatedKH = (kH - 1) * dilationH + 1
+ dilatedKW = (kW - 1) * dilationW + 1
+
+ oH = int((iH + 2 * padH - dilatedKH) / dH + 1)
+ oW = int((iW + 2 * padW - dilatedKW) / dW + 1)
+
+ output_size = (batch_size, patch_size, patch_size, oH, oW)
+ return output_size
+
+
+class Correlation(nn.Module):
+ r"""Correlation operator
+
+ This correlation operator works for optical flow correlation computation.
+
+ There are two batched tensors with shape :math:`(N, C, H, W)`,
+    and the correlation output's shape is :math:`(N, max\_displacement \times
+    2 + 1, max\_displacement \times 2 + 1, H_{out}, W_{out})`
+
+ where
+
+ .. math::
+ H_{out} = \left\lfloor\frac{H_{in} + 2 \times padding -
+ dilation \times (kernel\_size - 1) - 1}
+ {stride} + 1\right\rfloor
+
+ .. math::
+ W_{out} = \left\lfloor\frac{W_{in} + 2 \times padding - dilation
+ \times (kernel\_size - 1) - 1}
+ {stride} + 1\right\rfloor
+
+ the correlation item :math:`(N_i, dy, dx)` is formed by taking the sliding
+ window convolution between input1 and shifted input2,
+
+ .. math::
+ Corr(N_i, dx, dy) =
+ \sum_{c=0}^{C-1}
+ input1(N_i, c) \star
+ \mathcal{S}(input2(N_i, c), dy, dx)
+
+ where :math:`\star` is the valid 2d sliding window convolution operator,
+ and :math:`\mathcal{S}` means shifting the input features (auto-complete
+ zero marginal), and :math:`dx, dy` are shifting distance, :math:`dx, dy \in
+ [-max\_displacement \times dilation\_patch, max\_displacement \times
+ dilation\_patch]`.
+
+ Args:
+        kernel_size (int): The size of the sliding window, i.e. the local
+            neighborhood around the center points that is involved in the
+            correlation computation. Defaults to 1.
+ max_displacement (int): The radius for computing correlation volume,
+ but the actual working space can be dilated by dilation_patch.
+ Defaults to 1.
+ stride (int): The stride of the sliding blocks in the input spatial
+ dimensions. Defaults to 1.
+ padding (int): Zero padding added to all four sides of the input1.
+ Defaults to 0.
+        dilation (int): The spacing within the local neighborhood that will
+            be involved in the correlation. Defaults to 1.
+        dilation_patch (int): The spacing between positions at which the
+            correlation is computed. Defaults to 1.
+ """
+
+ def __init__(self,
+ kernel_size: int = 1,
+ max_displacement: int = 1,
+ stride: int = 1,
+ padding: int = 0,
+ dilation: int = 1,
+ dilation_patch: int = 1) -> None:
+ super().__init__()
+ self.kernel_size = kernel_size
+ self.max_displacement = max_displacement
+ self.stride = stride
+ self.padding = padding
+ self.dilation = dilation
+ self.dilation_patch = dilation_patch
+
+ def forward(self, input1: Tensor, input2: Tensor) -> Tensor:
+ return CorrelationFunction.apply(input1, input2, self.kernel_size,
+ self.max_displacement, self.stride,
+ self.padding, self.dilation,
+ self.dilation_patch)
+
+ def __repr__(self) -> str:
+ s = self.__class__.__name__
+ s += f'(kernel_size={self.kernel_size}, '
+ s += f'max_displacement={self.max_displacement}, '
+ s += f'stride={self.stride}, '
+ s += f'padding={self.padding}, '
+ s += f'dilation={self.dilation}, '
+ s += f'dilation_patch={self.dilation_patch})'
+ return s
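+
+
+# Illustrative usage sketch (not part of the original file): with kernel_size=1
+# and padding=0 the output keeps the spatial size and gains a
+# (2 * max_displacement + 1) ** 2 cost-volume dimension. The op relies on the
+# compiled `_ext` extension, so a CUDA device is assumed.
+if __name__ == '__main__':
+    if torch.cuda.is_available():
+        corr = Correlation(max_displacement=4)
+        feat1 = torch.randn(2, 32, 48, 48, device='cuda')
+        feat2 = torch.randn(2, 32, 48, 48, device='cuda')
+        cost_volume = corr(feat1, feat2)  # shape (2, 9, 9, 48, 48)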
diff --git a/mmcv/mmcv/ops/csrc/README.md b/mmcv/mmcv/ops/csrc/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..dbc82b534b1ab27593361b3053cb61e12fbd420e
--- /dev/null
+++ b/mmcv/mmcv/ops/csrc/README.md
@@ -0,0 +1,189 @@
+# Code Structure of CUDA operators
+
+This folder contains all non-python code for MMCV custom ops. Please follow the same architecture if you want to add new ops.
+
+## Directories Tree
+
+```folder
+.
+├── common
+│ ├── box_iou_rotated_utils.hpp
+│ ├── parrots_cpp_helper.hpp
+│ ├── parrots_cuda_helper.hpp
+│ ├── pytorch_cpp_helper.hpp
+│ ├── pytorch_cuda_helper.hpp
+│ ├── pytorch_device_registry.hpp
+│ ├── cuda
+│ │ ├── common_cuda_helper.hpp
+│ │ ├── parrots_cudawarpfunction.cuh
+│ │ ├── ...
+│ │ └── ops_cuda_kernel.cuh
+| ├── mps
+│ │ ├── MPSLibrary.h
+│ │ ├── ...
+│ │ └── MPSUtils.h
+| ├── mlu
+│ │ └── ...
+| └── utils
+│ │ └── ...
+├── onnxruntime
+│ ├── onnxruntime_register.h
+│ ├── onnxruntime_session_options_config_keys.h
+│ ├── ort_mmcv_utils.h
+│ ├── ...
+│ ├── onnx_ops.h
+│ └── cpu
+│ ├── onnxruntime_register.cpp
+│ ├── ...
+│ └── onnx_ops_impl.cpp
+├── parrots
+│ ├── ...
+│ ├── ops.cpp
+│ ├── ops_parrots.cpp
+│ └── ops_pytorch.h
+├── pytorch
+│ ├── info.cpp
+│ ├── pybind.cpp
+│ ├── ...
+│ ├── ops.cpp
+│ ├── cuda
+│ │ ├── ...
+│ │ └── ops_cuda.cu
+│ ├── cpu
+│ │ ├── ...
+│ │ └── ops.cpp
+│ ├── mps
+│ │ ├── ...
+│ | └── op_mps.mm
+│ └── mlu
+│ ├── ...
+│ └── op_mlu.cpp
+└── tensorrt
+ ├── trt_cuda_helper.cuh
+ ├── trt_plugin_helper.hpp
+ ├── trt_plugin.hpp
+ ├── trt_serialize.hpp
+ ├── ...
+ ├── trt_ops.hpp
+ └── plugins
+ ├── trt_cuda_helper.cu
+ ├── trt_plugin.cpp
+ ├── ...
+ ├── trt_ops.cpp
+ └── trt_ops_kernel.cu
+```
+
+## Components
+
+- `common`: This directory contains all tools and shared codes.
+  - `cuda`: The CUDA kernels that can be shared by all backends. **HIP** kernels are also placed here since they have a similar syntax.
+ - `mps`: The tools used to support MPS ops. **NOTE** that MPS support is **experimental**.
+ - `mlu`: The MLU kernels used to support [Cambricon](https://www.cambricon.com/) device.
+ - `utils`: The kernels and utils of spconv.
+- `onnxruntime`: **ONNX Runtime** support for custom ops. It has been deprecated; please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy).
+  - `cpu`: CPU implementation of supported ops.
+- `parrots`: **Parrots** is a deep learning framework for model training and inference. Parrots custom ops are placed in this directory.
+- `pytorch`: **PyTorch** custom ops are supported by binding C++ to Python with **pybind11**. The ops implementation and binding codes are placed in this directory.
+  - `cuda`: This directory contains cuda kernel launchers, which feed memory pointers of tensors to the cuda kernels in `common/cuda`. The launchers provide the c++ interface to the cuda implementation of the corresponding custom ops.
+  - `cpu`: This directory contains cpu implementations of the corresponding custom ops.
+  - `mlu`: This directory contains the launchers of the MLU kernels.
+  - `mps`: MPS ops implementation and launchers.
+- `tensorrt`: **TensorRT** support for custom ops. It has been deprecated; please try the latest custom ops in [MMDeploy](https://github.com/open-mmlab/mmdeploy).
+ - `plugins`: This directory contains the implementation of the supported custom ops. Some ops might also use shared cuda kernel in `common/cuda`.
+
+## How to add new PyTorch ops?
+
+1. (Optional) Add shared kernel in `common` to support special hardware platform.
+
+ ```c++
+ // src/common/cuda/new_ops_cuda_kernel.cuh
+
+   template <typename T>
+ __global__ void new_ops_forward_cuda_kernel(const T* input, T* output, ...) {
+ // forward here
+ }
+
+ ```
+
+ Add cuda kernel launcher in `pytorch/cuda`.
+
+ ```c++
+ // src/pytorch/cuda
+   #include <new_ops_cuda_kernel.cuh>
+
+ void NewOpsForwardCUDAKernelLauncher(Tensor input, Tensor output, ...){
+ // initialize
+ at::cuda::CUDAGuard device_guard(input.device());
+ cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+ ...
+ AT_DISPATCH_FLOATING_TYPES_AND_HALF(
+ input.scalar_type(), "new_ops_forward_cuda_kernel", ([&] {
+           new_ops_forward_cuda_kernel<scalar_t>
+               <<<GET_BLOCKS(output_size), THREADS_PER_BLOCK, 0, stream>>>(
+                   input.data_ptr<scalar_t>(), output.data_ptr<scalar_t>(),...);
+ }));
+ AT_CUDA_CHECK(cudaGetLastError());
+ }
+ ```
+
+2. Register implementation for different devices.
+
+ ```c++
+ // src/pytorch/cuda/cudabind.cpp
+ ...
+
+ Tensor new_ops_forward_cuda(Tensor input, Tensor output, ...){
+ // implement cuda forward here
+ // use `NewOpsForwardCUDAKernelLauncher` here
+ }
+ // declare interface here.
+ Tensor new_ops_forward_impl(Tensor input, Tensor output, ...);
+ // register the implementation for given device (CUDA here).
+ REGISTER_DEVICE_IMPL(new_ops_forward_impl, CUDA, new_ops_forward_cuda);
+ ```
+
+3. Add ops implementation in `pytorch` directory. Select different implementations according to device type.
+
+ ```c++
+ // src/pytorch/new_ops.cpp
+ Tensor new_ops_forward_impl(Tensor input, Tensor output, ...){
+ // dispatch the implementation according to the device type of input.
+ DISPATCH_DEVICE_IMPL(new_ops_forward_impl, input, output, ...);
+ }
+ ...
+
+ Tensor new_ops_forward(Tensor input, Tensor output, ...){
+ return new_ops_forward_impl(input, output, ...);
+ }
+ ```
+
+4. Binding the implementation in `pytorch/pybind.cpp`
+
+ ```c++
+ // src/pytorch/pybind.cpp
+
+ ...
+
+ Tensor new_ops_forward(Tensor input, Tensor output, ...);
+
+ ...
+
+ // bind with pybind11
+ m.def("new_ops_forward", &new_ops_forward, "new_ops_forward",
+ py::arg("input"), py::arg("output"), ...);
+
+ ...
+
+ ```
+
+5. Build MMCV again. The new op can then be used in Python:
+
+ ```python
+ from ..utils import ext_loader
+ ext_module = ext_loader.load_ext('_ext', ['new_ops_forward'])
+
+ ...
+
+ ext_module.new_ops_forward(input, output, ...)
+
+ ```
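+
+   To make the new op differentiable on the Python side, the raw extension
+   call is usually wrapped in a `torch.autograd.Function`. The sketch below is
+   illustrative only; `new_ops_forward`/`new_ops_backward` mirror the
+   placeholder op used throughout this guide and are not real MMCV symbols.
+
+   ```python
+   import torch
+   from torch.autograd import Function
+
+   from ..utils import ext_loader
+
+   ext_module = ext_loader.load_ext(
+       '_ext', ['new_ops_forward', 'new_ops_backward'])
+
+
+   class NewOpsFunction(Function):
+
+       @staticmethod
+       def forward(ctx, input):
+           output = input.new_zeros(input.size())
+           ext_module.new_ops_forward(input, output)
+           ctx.save_for_backward(input)
+           return output
+
+       @staticmethod
+       def backward(ctx, grad_output):
+           input, = ctx.saved_tensors
+           grad_input = torch.zeros_like(input)
+           ext_module.new_ops_backward(grad_output.contiguous(), input,
+                                       grad_input)
+           return grad_input
+
+
+   new_ops = NewOpsFunction.apply
+   ```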
diff --git a/mmcv/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp b/mmcv/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..243200e156f1384b625d6bac7fa4c68e533d9441
--- /dev/null
+++ b/mmcv/mmcv/ops/csrc/common/box_iou_rotated_utils.hpp
@@ -0,0 +1,347 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+// modified from
+// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_utils.h
+#pragma once
+#include <cassert>
+#include <cmath>
+
+#ifdef __CUDACC__
+// Designates functions callable from the host (CPU) and the device (GPU)
+#define HOST_DEVICE __host__ __device__
+#define HOST_DEVICE_INLINE HOST_DEVICE __forceinline__
+#else
+#include <algorithm>
+#define HOST_DEVICE
+#define HOST_DEVICE_INLINE HOST_DEVICE inline
+#endif
+
+namespace {
+
+template <typename T>
+struct RotatedBox {
+ T x_ctr, y_ctr, w, h, a;
+};
+
+template <typename T>
+struct Point {
+ T x, y;
+ HOST_DEVICE_INLINE Point(const T& px = 0, const T& py = 0) : x(px), y(py) {}
+ HOST_DEVICE_INLINE Point operator+(const Point& p) const {
+ return Point(x + p.x, y + p.y);
+ }
+ HOST_DEVICE_INLINE Point& operator+=(const Point& p) {
+ x += p.x;
+ y += p.y;
+ return *this;
+ }
+ HOST_DEVICE_INLINE Point operator-(const Point& p) const {
+ return Point(x - p.x, y - p.y);
+ }
+ HOST_DEVICE_INLINE Point operator*(const T coeff) const {
+ return Point(x * coeff, y * coeff);
+ }
+};
+
+template <typename T>
+HOST_DEVICE_INLINE T dot_2d(const Point<T>& A, const Point<T>& B) {
+ return A.x * B.x + A.y * B.y;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T cross_2d(const Point<T>& A, const Point<T>& B) {
+ return A.x * B.y - B.x * A.y;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE void get_rotated_vertices(const RotatedBox<T>& box,
+                                             Point<T> (&pts)[4]) {
+ // M_PI / 180. == 0.01745329251
+ // double theta = box.a * 0.01745329251;
+ // MODIFIED
+ double theta = box.a;
+ T cosTheta2 = (T)cos(theta) * 0.5f;
+ T sinTheta2 = (T)sin(theta) * 0.5f;
+
+ // y: top --> down; x: left --> right
+ pts[0].x = box.x_ctr - sinTheta2 * box.h - cosTheta2 * box.w;
+ pts[0].y = box.y_ctr + cosTheta2 * box.h - sinTheta2 * box.w;
+ pts[1].x = box.x_ctr + sinTheta2 * box.h - cosTheta2 * box.w;
+ pts[1].y = box.y_ctr - cosTheta2 * box.h - sinTheta2 * box.w;
+ pts[2].x = 2 * box.x_ctr - pts[0].x;
+ pts[2].y = 2 * box.y_ctr - pts[0].y;
+ pts[3].x = 2 * box.x_ctr - pts[1].x;
+ pts[3].y = 2 * box.y_ctr - pts[1].y;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE int get_intersection_points(const Point<T> (&pts1)[4],
+                                               const Point<T> (&pts2)[4],
+                                               Point<T> (&intersections)[24]) {
+ // Line vector
+ // A line from p1 to p2 is: p1 + (p2-p1)*t, t=[0,1]
+  Point<T> vec1[4], vec2[4];
+ for (int i = 0; i < 4; i++) {
+ vec1[i] = pts1[(i + 1) % 4] - pts1[i];
+ vec2[i] = pts2[(i + 1) % 4] - pts2[i];
+ }
+
+ // Line test - test all line combos for intersection
+ int num = 0; // number of intersections
+ for (int i = 0; i < 4; i++) {
+ for (int j = 0; j < 4; j++) {
+ // Solve for 2x2 Ax=b
+ T det = cross_2d(vec2[j], vec1[i]);
+
+ // This takes care of parallel lines
+ if (fabs(det) <= 1e-14) {
+ continue;
+ }
+
+ auto vec12 = pts2[j] - pts1[i];
+
+ T t1 = cross_2d(vec2[j], vec12) / det;
+ T t2 = cross_2d(vec1[i], vec12) / det;
+
+ if (t1 >= 0.0f && t1 <= 1.0f && t2 >= 0.0f && t2 <= 1.0f) {
+ intersections[num++] = pts1[i] + vec1[i] * t1;
+ }
+ }
+ }
+
+ // Check for vertices of rect1 inside rect2
+ {
+ const auto& AB = vec2[0];
+ const auto& DA = vec2[3];
+ auto ABdotAB = dot_2d(AB, AB);
+ auto ADdotAD = dot_2d(DA, DA);
+ for (int i = 0; i < 4; i++) {
+ // assume ABCD is the rectangle, and P is the point to be judged
+ // P is inside ABCD iff. P's projection on AB lies within AB
+ // and P's projection on AD lies within AD
+
+ auto AP = pts1[i] - pts2[0];
+
+ auto APdotAB = dot_2d(AP, AB);
+ auto APdotAD = -dot_2d(AP, DA);
+
+ if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
+ (APdotAD <= ADdotAD)) {
+ intersections[num++] = pts1[i];
+ }
+ }
+ }
+
+ // Reverse the check - check for vertices of rect2 inside rect1
+ {
+ const auto& AB = vec1[0];
+ const auto& DA = vec1[3];
+ auto ABdotAB = dot_2d(AB, AB);
+ auto ADdotAD = dot_2d(DA, DA);
+ for (int i = 0; i < 4; i++) {
+ auto AP = pts2[i] - pts1[0];
+
+ auto APdotAB = dot_2d(AP, AB);
+ auto APdotAD = -dot_2d(AP, DA);
+
+ if ((APdotAB >= 0) && (APdotAD >= 0) && (APdotAB <= ABdotAB) &&
+ (APdotAD <= ADdotAD)) {
+ intersections[num++] = pts2[i];
+ }
+ }
+ }
+
+ return num;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE int convex_hull_graham(const Point<T> (&p)[24],
+                                          const int& num_in, Point<T> (&q)[24],
+                                          bool shift_to_zero = false) {
+ assert(num_in >= 2);
+
+ // Step 1:
+ // Find point with minimum y
+ // if more than 1 points have the same minimum y,
+ // pick the one with the minimum x.
+ int t = 0;
+ for (int i = 1; i < num_in; i++) {
+ if (p[i].y < p[t].y || (p[i].y == p[t].y && p[i].x < p[t].x)) {
+ t = i;
+ }
+ }
+ auto& start = p[t]; // starting point
+
+ // Step 2:
+ // Subtract starting point from every points (for sorting in the next step)
+ for (int i = 0; i < num_in; i++) {
+ q[i] = p[i] - start;
+ }
+
+ // Swap the starting point to position 0
+ auto tmp = q[0];
+ q[0] = q[t];
+ q[t] = tmp;
+
+ // Step 3:
+ // Sort point 1 ~ num_in according to their relative cross-product values
+ // (essentially sorting according to angles)
+ // If the angles are the same, sort according to their distance to origin
+ T dist[24];
+ for (int i = 0; i < num_in; i++) {
+ dist[i] = dot_2d(q[i], q[i]);
+ }
+
+#ifdef __CUDACC__
+ // CUDA version
+ // In the future, we can potentially use thrust
+ // for sorting here to improve speed (though not guaranteed)
+ for (int i = 1; i < num_in - 1; i++) {
+ for (int j = i + 1; j < num_in; j++) {
+ T crossProduct = cross_2d(q[i], q[j]);
+ if ((crossProduct < -1e-6) ||
+ (fabs(crossProduct) < 1e-6 && dist[i] > dist[j])) {
+ auto q_tmp = q[i];
+ q[i] = q[j];
+ q[j] = q_tmp;
+ auto dist_tmp = dist[i];
+ dist[i] = dist[j];
+ dist[j] = dist_tmp;
+ }
+ }
+ }
+#else
+ // CPU version
+ std::sort(q + 1, q + num_in,
+            [](const Point<T>& A, const Point<T>& B) -> bool {
+ T temp = cross_2d(A, B);
+ if (fabs(temp) < 1e-6) {
+ return dot_2d(A, A) < dot_2d(B, B);
+ } else {
+ return temp > 0;
+ }
+ });
+ // compute distance to origin after sort, since the points are now different.
+ for (int i = 0; i < num_in; i++) {
+ dist[i] = dot_2d(q[i], q[i]);
+ }
+#endif
+
+ // Step 4:
+ // Make sure there are at least 2 points (that don't overlap with each other)
+ // in the stack
+ int k; // index of the non-overlapped second point
+ for (k = 1; k < num_in; k++) {
+ if (dist[k] > 1e-8) {
+ break;
+ }
+ }
+ if (k == num_in) {
+ // We reach the end, which means the convex hull is just one point
+ q[0] = p[t];
+ return 1;
+ }
+ q[1] = q[k];
+ int m = 2; // 2 points in the stack
+ // Step 5:
+ // Finally we can start the scanning process.
+ // When a non-convex relationship between the 3 points is found
+ // (either concave shape or duplicated points),
+ // we pop the previous point from the stack
+ // until the 3-point relationship is convex again, or
+ // until the stack only contains two points
+ for (int i = k + 1; i < num_in; i++) {
+ while (m > 1 && cross_2d(q[i] - q[m - 2], q[m - 1] - q[m - 2]) >= 0) {
+ m--;
+ }
+ q[m++] = q[i];
+ }
+
+ // Step 6 (Optional):
+ // In general sense we need the original coordinates, so we
+ // need to shift the points back (reverting Step 2)
+ // But if we're only interested in getting the area/perimeter of the shape
+ // We can simply return.
+ if (!shift_to_zero) {
+ for (int i = 0; i < m; i++) {
+ q[i] += start;
+ }
+ }
+
+ return m;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T polygon_area(const Point<T> (&q)[24], const int& m) {
+ if (m <= 2) {
+ return 0;
+ }
+
+ T area = 0;
+ for (int i = 1; i < m - 1; i++) {
+ area += fabs(cross_2d(q[i] - q[0], q[i + 1] - q[0]));
+ }
+
+ return area / 2.0;
+}
+
+template <typename T>
+HOST_DEVICE_INLINE T rotated_boxes_intersection(const RotatedBox<T>& box1,
+                                                const RotatedBox<T>& box2) {
+ // There are up to 4 x 4 + 4 + 4 = 24 intersections (including dups) returned
+ // from rotated_rect_intersection_pts
+  Point<T> intersectPts[24], orderedPts[24];
+
+  Point<T> pts1[4];
+  Point<T> pts2[4];
+ get_rotated_vertices(box1, pts1);
+ get_rotated_vertices(box2, pts2);
+
+ int num = get_intersection_points(pts1, pts2, intersectPts);
+
+ if (num <= 2) {
+ return 0.0;
+ }
+
+ // Convex Hull to order the intersection points in clockwise order and find
+ // the contour area.
+ int num_convex = convex_hull_graham(intersectPts, num, orderedPts, true);
+ return polygon_area(orderedPts, num_convex);
+}
+
+} // namespace
+
+template <typename T>
+HOST_DEVICE_INLINE T single_box_iou_rotated(T const* const box1_raw,
+ T const* const box2_raw,
+ const int mode_flag) {
+ // shift center to the middle point to achieve higher precision in result
+  RotatedBox<T> box1, box2;
+ auto center_shift_x = (box1_raw[0] + box2_raw[0]) / 2.0;
+ auto center_shift_y = (box1_raw[1] + box2_raw[1]) / 2.0;
+ box1.x_ctr = box1_raw[0] - center_shift_x;
+ box1.y_ctr = box1_raw[1] - center_shift_y;
+ box1.w = box1_raw[2];
+ box1.h = box1_raw[3];
+ box1.a = box1_raw[4];
+ box2.x_ctr = box2_raw[0] - center_shift_x;
+ box2.y_ctr = box2_raw[1] - center_shift_y;
+ box2.w = box2_raw[2];
+ box2.h = box2_raw[3];
+ box2.a = box2_raw[4];
+
+ const T area1 = box1.w * box1.h;
+ const T area2 = box2.w * box2.h;
+ if (area1 < 1e-14 || area2 < 1e-14) {
+ return 0.f;
+ }
+
+ const T intersection = rotated_boxes_intersection(box1, box2);
+ T baseS = 1.0;
+ if (mode_flag == 0) {
+ baseS = (area1 + area2 - intersection);
+ } else if (mode_flag == 1) {
+ baseS = area1;
+ }
+ const T iou = intersection / baseS;
+ return iou;
+}
diff --git a/mmcv/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh b/mmcv/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..36e41107ebd52d3cf5e9a71cffe6eddeed4f0765
--- /dev/null
+++ b/mmcv/mmcv/ops/csrc/common/cuda/active_rotated_filter_cuda_kernel.cuh
@@ -0,0 +1,59 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+// Modified from
+// https://github.com/csuhan/s2anet/blob/master/mmdet/ops/orn/src/cuda/ActiveRotatingFilter_cuda.cu
+#ifndef ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
+#define ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename scalar_t>
+__global__ void active_rotated_filter_forward_cuda_kernel(
+ const int nthreads, const scalar_t* weight_data, const int* indices_data,
+ const int num_input_planes, const int num_output_planes,
+ const int num_orientations, const int num_rotations, const int nEntry,
+ scalar_t* output_data) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ int l = index % nEntry;
+ int j = (index / nEntry) % num_input_planes;
+ int i = index / nEntry / num_input_planes;
+ int k;
+ scalar_t val = *(weight_data + index);
+ for (k = 0; k < num_rotations; k++) {
+ int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
+ scalar_t* target = output_data +
+ i * (num_rotations * num_input_planes * nEntry) +
+ k * (num_input_planes * nEntry) + j * (nEntry) + idx;
+ *target = val;
+ }
+ }
+}
+
+template <typename scalar_t>
+__global__ void active_rotated_filter_backward_cuda_kernel(
+ const int nthreads, const scalar_t* gradWeight_data,
+ const int* indices_data, const int num_input_planes,
+ const int num_output_planes, const int num_orientations,
+ const int num_rotations, const int nEntry, scalar_t* weight_data) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ int l = index % nEntry;
+ int j = (index / nEntry) % num_input_planes;
+ int i = index / nEntry / num_input_planes;
+ int k;
+ scalar_t* val = weight_data + index;
+ *val = 0;
+ scalar_t tmp = 0;
+ for (k = 0; k < num_rotations; k++) {
+ int idx = (int)(*(indices_data + l * num_rotations + k)) - 1;
+ scalar_t target =
+ *(gradWeight_data + i * (num_rotations * num_input_planes * nEntry) +
+ k * (num_input_planes * nEntry) + j * (nEntry) + idx);
+ tmp = tmp + target;
+ }
+ *val = tmp;
+ }
+}
+#endif // ACTIVE_ROTATED_FILTER_CUDA_KERNEL_CUH
diff --git a/mmcv/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh b/mmcv/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..9f9250844b9ceeca0df0377640c3d28e3f61cecc
--- /dev/null
+++ b/mmcv/mmcv/ops/csrc/common/cuda/assign_score_withk_cuda_kernel.cuh
@@ -0,0 +1,116 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
+#define ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+// input: points(B,N0,M,O), centers(B,N0,M,O), scores(B,N1,K,M), knn_idx(B,N1,K)
+// output: fout(B,O,N)
+// algo: fout(b,i,k,j) = s(b,i,k,m)*p(b,c(i),k,m,j) = s(b,i,k,m)*p(b,i(k),m,j)
+// i(k) = idx(b,i,k)
+// sum: fout(b,i,j) = fout(b,i,j) + s(b,i,k,m)*p(b,i,k,m,j)
+// avg: fout(b,i,j) = sum(fout(b,i,k,j)) / k
+// max: fout(b,i,j) = max(fout(b,i,k,j), sum(s(b,i,k,m)*p(b,i,k,m,j)))
+
+template <typename T>
+__global__ void assign_score_withk_forward_cuda_kernel(
+ const int B, const int N0, const int N1, const int M, const int K,
+ const int O, const int aggregate, const T* points, const T* centers,
+ const T* scores, const int64_t* knn_idx, T* output) {
+ // ----- parallel loop for B, N1, K and O ---------
+ CUDA_1D_KERNEL_LOOP(i, B * O * N1 * K) {
+ // ------- loop for M ----------
+ const int b = (int)(i / (O * N1 * K));
+ const int o = (int)(i % (O * N1 * K) / (N1 * K));
+ const int n = (int)(i % (N1 * K) / K);
+ const int k = (int)(i % K);
+ const int cn = (int)knn_idx[b * K * N1 + n * K +
+ 0]; // The first neighbor is the center point
+ const int kn = (int)knn_idx[b * K * N1 + n * K + k];
+ if (kn >= N0 ||
+ kn < 0) { // if index overflows, it is out of the neighborhood range
+ return;
+ }
+ assert(b < B);
+ assert(kn < N0);
+ assert(cn < N0);
+ assert(o < O);
+ assert(n < N1);
+ const int out_idx = b * N1 * O * K + o * N1 * K + n * K + k;
+ T val = output[out_idx];
+ for (int m = 0; m < M; m++) {
+ val += points[b * N0 * M * O + kn * M * O + m * O + o] *
+ scores[b * N1 * K * M + n * K * M + k * M + m] -
+ centers[b * N0 * M * O + cn * M * O + m * O + o] *
+ scores[b * N1 * K * M + n * K * M + k * M + m];
+ }
+ output[out_idx] = val;
+ }
+}
+
+template <typename T>
+__global__ void assign_score_withk_points_backward_cuda_kernel(
+ const int B, const int N0, const int N, const int M, const int K,
+ const int O, const int aggregate, const T* grad_out, const T* scores,
+ const int64_t* knn_idx, T* grad_points, T* grad_centers) {
+ // ----- parallel loop for B, M, O ---------
+ CUDA_1D_KERNEL_LOOP(i, B * M * O) {
+ int b = (int)(i / (M * O));
+ int m = (int)(i % (M * O) / O);
+ int o = (int)(i % O);
+
+ // ----- loop for N,K ---------
+ for (int n = 0; n < N; n++) {
+ for (int k = 0; k < K; k++) {
+ int kn = knn_idx[b * N * K + n * K + k];
+ int cn = knn_idx[b * N * K + n * K + 0];
+ if (kn >= N0 || kn < 0) { // if index overflows, it is out of the
+ // neighborhood range
+ continue;
+ }
+ atomicAdd(grad_points + b * N0 * M * O + kn * M * O + m * O + o,
+ scores[b * N * K * M + n * K * M + k * M + m] *
+ grad_out[b * O * N * K + o * N * K + n * K + k]);
+ atomicAdd(grad_centers + b * N0 * M * O + cn * M * O + m * O + o,
+ -scores[b * N * K * M + n * K * M + k * M + m] *
+ grad_out[b * O * N * K + o * N * K + n * K + k]);
+ }
+ }
+ }
+}
+
+template <typename T>
+__global__ void assign_score_withk_scores_backward_cuda_kernel(
+ const int B, const int N0, const int N, const int M, const int K,
+ const int O, const int aggregate, const T* grad_out, const T* points,
+ const T* centers, const int64_t* knn_idx, T* grad_scores) {
+ // ----- parallel loop for B, N, K, M ---------
+ CUDA_1D_KERNEL_LOOP(i, B * N * K * M) {
+ const int b = (int)(i / (N * M * K));
+ const int n = (int)(i % (N * M * K) / M / K);
+ const int k = (int)(i % (M * K) / M);
+ const int m = (int)(i % M);
+ const int cn = knn_idx[b * N * K + n * K + 0];
+ const int kn = knn_idx[b * N * K + n * K + k];
+ if (kn >= N0 ||
+ kn < 0) { // if index overflows, it is out of the neighborhood range
+ return;
+ }
+
+ // -------------- loop for O ------------------------
+ const int out_idx = b * N * K * M + n * K * M + k * M + m;
+ T val = grad_scores[out_idx];
+ for (int o = 0; o < O; o++) {
+ val += (points[b * N0 * M * O + kn * M * O + m * O + o] -
+ centers[b * N0 * M * O + cn * M * O + m * O + o]) *
+ grad_out[b * O * N * K + o * N * K + n * K + k];
+ }
+ grad_scores[out_idx] = val;
+ }
+}
+
+#endif // ASSIGN_SCORE_WITHK_CUDA_KERNEL_CUH
diff --git a/mmcv/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh b/mmcv/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..632b5c4940b33a9d8d839fa3f3b92e7b6a2bd29e
--- /dev/null
+++ b/mmcv/mmcv/ops/csrc/common/cuda/ball_query_cuda_kernel.cuh
@@ -0,0 +1,58 @@
+// Copyright (c) OpenMMLab. All rights reserved
+// Modified from
+// https://github.com/sshaoshuai/Pointnet2.PyTorch/tree/master/pointnet2/src/ball_query_gpu.cu
+#ifndef BALL_QUERY_CUDA_KERNEL_CUH
+#define BALL_QUERY_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__global__ void ball_query_forward_cuda_kernel(int b, int n, int m,
+ float min_radius,
+ float max_radius, int nsample,
+ const T* new_xyz, const T* xyz,
+ int* idx) {
+ // new_xyz: (B, M, 3)
+ // xyz: (B, N, 3)
+ // output:
+ // idx: (B, M, nsample)
+ int bs_idx = blockIdx.y;
+ CUDA_1D_KERNEL_LOOP(pt_idx, m) {
+ if (bs_idx >= b) return;
+
+ new_xyz += bs_idx * m * 3 + pt_idx * 3;
+ xyz += bs_idx * n * 3;
+ idx += bs_idx * m * nsample + pt_idx * nsample;
+
+ float max_radius2 = max_radius * max_radius;
+ float min_radius2 = min_radius * min_radius;
+ T new_x = new_xyz[0];
+ T new_y = new_xyz[1];
+ T new_z = new_xyz[2];
+
+ int cnt = 0;
+ for (int k = 0; k < n; ++k) {
+ T x = xyz[k * 3 + 0];
+ T y = xyz[k * 3 + 1];
+ T z = xyz[k * 3 + 2];
+ T d2 = (new_x - x) * (new_x - x) + (new_y - y) * (new_y - y) +
+ (new_z - z) * (new_z - z);
+ if (d2 == 0 || (d2 >= min_radius2 && d2 < max_radius2)) {
+ if (cnt == 0) {
+ for (int l = 0; l < nsample; ++l) {
+ idx[l] = k;
+ }
+ }
+ idx[cnt] = k;
+ ++cnt;
+ if (cnt >= nsample) break;
+ }
+ }
+ }
+}
+
+#endif // BALL_QUERY_CUDA_KERNEL_CUH
diff --git a/mmcv/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh b/mmcv/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..15bd91eca629895d3a99dde3fe6614036ca31dc9
--- /dev/null
+++ b/mmcv/mmcv/ops/csrc/common/cuda/bbox_overlaps_cuda_kernel.cuh
@@ -0,0 +1,147 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef BBOX_OVERLAPS_CUDA_KERNEL_CUH
+#define BBOX_OVERLAPS_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+template <typename T>
+__device__ __forceinline__ void load_bbox(const T* bbox, const int base, T& x1,
+ T& y1, T& x2, T& y2) {
+ x1 = bbox[base];
+ y1 = bbox[base + 1];
+ x2 = bbox[base + 2];
+ y2 = bbox[base + 3];
+}
+
+template <>
+__device__ __forceinline__ void load_bbox<float>(const float* bbox,
+ const int base, float& x1,
+ float& y1, float& x2,
+ float& y2) {
+  const float4 bbox_offset = reinterpret_cast<const float4*>(bbox + base)[0];
+ x1 = bbox_offset.x;
+ y1 = bbox_offset.y;
+ x2 = bbox_offset.z;
+ y2 = bbox_offset.w;
+}
+
+template <typename T>
+__global__ void bbox_overlaps_cuda_kernel(const T* bbox1, const T* bbox2,
+ T* ious, const int num_bbox1,
+ const int num_bbox2, const int mode,
+ const bool aligned,
+ const int offset) {
+ if (aligned) {
+ CUDA_1D_KERNEL_LOOP(index, num_bbox1) {
+ const int b1 = index;
+ const int b2 = index;
+
+ const int base1 = b1 << 2; // b1 * 4
+ T b1_x1, b1_y1, b1_x2, b1_y2;
+ load_bbox(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
+ const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
+
+ const int base2 = b2 << 2; // b2 * 4
+ T b2_x1, b2_y1, b2_x2, b2_y2;
+ load_bbox(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
+ const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
+
+ const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
+ const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
+ const T width = fmaxf(right - left + offset, 0.f);
+ const T height = fmaxf(bottom - top + offset, 0.f);
+ const T interS = width * height;
+
+ const T baseS =
+ fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));
+ ious[index] = interS / baseS;
+ }
+ } else {
+ CUDA_1D_KERNEL_LOOP(index, num_bbox1 * num_bbox2) {
+ const int b1 = index / num_bbox2;
+ const int b2 = index % num_bbox2;
+
+ const int base1 = b1 << 2; // b1 * 4
+ T b1_x1, b1_y1, b1_x2, b1_y2;
+ load_bbox(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
+ const T b1_area = (b1_x2 - b1_x1 + offset) * (b1_y2 - b1_y1 + offset);
+
+ const int base2 = b2 << 2; // b2 * 4
+ T b2_x1, b2_y1, b2_x2, b2_y2;
+ load_bbox(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
+ const T b2_area = (b2_x2 - b2_x1 + offset) * (b2_y2 - b2_y1 + offset);
+
+ const T left = fmaxf(b1_x1, b2_x1), right = fminf(b1_x2, b2_x2);
+ const T top = fmaxf(b1_y1, b2_y1), bottom = fminf(b1_y2, b2_y2);
+ const T width = fmaxf(right - left + offset, 0.f);
+ const T height = fmaxf(bottom - top + offset, 0.f);
+ const T interS = width * height;
+
+ const T baseS =
+ fmaxf(mode == 0 ? b1_area + b2_area - interS : b1_area, T(offset));
+ ious[index] = interS / baseS;
+ }
+ }
+}
+
+#if __CUDA_ARCH__ >= 530
+__device__ __forceinline__ __half __half_area(const __half x1, const __half y1,
+ const __half x2, const __half y2,
+ const __half offset) {
+ const __half half_w = __hadd(__hsub(x2, x1), offset);
+ const __half half_h = __hadd(__hsub(y2, y1), offset);
+ return __hmul(half_w, half_h);
+}
+
+__device__ __forceinline__ __half __half_max(const __half a, const __half b) {
+ return __hge(a, b) ? a : b;
+}
+
+__device__ __forceinline__ __half __half_min(const __half a, const __half b) {
+ return __hle(a, b) ? a : b;
+}
+
+// fp16 won't provide much increase when aligned==true. It is useful when
+// aligned==false, which would give you ~40% bonus.
+__device__ void bbox_overlaps_cuda_kernel_half(
+ const __half* bbox1, const __half* bbox2, __half* ious, const int num_bbox1,
+ const int num_bbox2, const int mode, const bool aligned, const int offset) {
+ const int num_output = aligned ? num_bbox1 : num_bbox1 * num_bbox2;
+ const __half h_offset = __int2half_rn(offset);
+ CUDA_1D_KERNEL_LOOP(index, num_output) {
+ const int b1 = aligned ? index : index / num_bbox2;
+ const int b2 = aligned ? index : index % num_bbox2;
+
+ const int base1 = b1 << 2;
+ __half b1_x1, b1_y1, b1_x2, b1_y2;
+ load_bbox<__half>(bbox1, base1, b1_x1, b1_y1, b1_x2, b1_y2);
+ const __half b1_area = __half_area(b1_x1, b1_y1, b1_x2, b1_y2, h_offset);
+
+ const int base2 = b2 << 2;
+ __half b2_x1, b2_y1, b2_x2, b2_y2;
+ load_bbox<__half>(bbox2, base2, b2_x1, b2_y1, b2_x2, b2_y2);
+ const __half b2_area = __half_area(b2_x1, b2_y1, b2_x2, b2_y2, h_offset);
+
+ const __half left = __half_max(b1_x1, b2_x1),
+ right = __half_min(b1_x2, b2_x2);
+ const __half top = __half_max(b1_y1, b2_y1),
+ bottom = __half_min(b1_y2, b2_y2);
+ const __half width =
+ __half_max(__hadd(__hsub(right, left), h_offset), __float2half(0.f));
+ const __half height =
+ __half_max(__hadd(__hsub(bottom, top), h_offset), __float2half(0.f));
+ const __half interS = __hmul(width, height);
+
+ const __half baseS = __half_max(
+ mode == 0 ? __hsub(__hadd(b1_area, b2_area), interS) : b1_area,
+ h_offset);
+ ious[index] = __hdiv(interS, baseS);
+ }
+}
+#endif // __CUDA_ARCH__ >= 530
+
+#endif // BBOX_OVERLAPS_CUDA_KERNEL_CUH
diff --git a/mmcv/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh b/mmcv/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..1d2a2197b45ef5c82412c4b75d7819a7e27674f6
--- /dev/null
+++ b/mmcv/mmcv/ops/csrc/common/cuda/border_align_cuda_kernel.cuh
@@ -0,0 +1,200 @@
+// Copyright (c) OpenMMLab. All rights reserved
+// modified from
+// https://github.com/Megvii-BaseDetection/cvpods/blob/master/cvpods/layers/csrc/border_align/border_align_kernel.cu.
+// the main difference: (1) use `argmax_idx` for fast computing of gradient
+// during the backward. (2) `wh` is directly computed by `boxes`, rather than
+// passing it as argument to forward or backward functions.
+
+#ifndef BORDER_ALIGN_CUDA_KERNEL_CUH
+#define BORDER_ALIGN_CUDA_KERNEL_CUH
+
+#include <float.h>
+#ifdef MMCV_WITH_TRT
+#include "common_cuda_helper.hpp"
+#else // MMCV_WITH_TRT
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif // MMCV_USE_PARROTS
+#endif // MMCV_WITH_TRT
+
+enum BorderMode { Top = 0, Left = 1, Bottom = 2, Right = 3 };
+
+/*** Forward ***/
+template <typename T>
+__global__ void border_align_forward_cuda_kernel(
+ const int nthreads, const T* input, const T* boxes, T* output,
+ int* argmax_idx, const int channels, const int box_size, const int height,
+ const int width, const int pool_size) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (batch_idx, c_idx, box_idx) is an element paralleled for computing
+ // output, and `extreme_idx` is in range [0,3]
+ int batch_idx, c_idx, box_idx, extreme_idx, maxidx, *offset_argmax_idx;
+ const T *offset_box, *offset_input, *offset_box_x;
+ T *offset_output, box_width, box_height, stride, x_stride, y_stride, x, y,
+ val, maxval;
+
+ extreme_idx = threadIdx.y;
+ // shape (N, C, box_size, 4) for output
+ batch_idx = index / channels / box_size;
+ // shape (N, box_size, 4) for boxes
+ box_idx = index % box_size + batch_idx * box_size;
+ c_idx = (index / box_size) % channels;
+
+ offset_box = boxes + box_idx * 4;
+ box_width = *(offset_box + 2) - *offset_box;
+ box_height = *(offset_box + 3) - *(offset_box + 1);
+ offset_output = output + index * 4 + extreme_idx;
+ offset_argmax_idx = argmax_idx + index * 4 + extreme_idx;
+ // shape (N, 4C, h, w) for input.
+ // [0,C) for top feature, [C,2C) for left feature,
+ // [2C,3C) for bottom feature, [3C,4C) for right feature
+ offset_input =
+ input + (batch_idx * channels * 4 + extreme_idx * channels + c_idx) *
+ height * width;
+
+ // extreme_idx in [0,1] -> offset_box_x indexed at x1
+ // extreme_idx in [2,3] -> offset_box_x indexed at x2
+ offset_box_x = offset_box + extreme_idx / 2 * 2;
+
+ // (x1,y1) or (x2,y2) for (x,y)
+ x = *offset_box_x;
+ y = *(offset_box_x + 1);
+
+ switch (extreme_idx) {
+ // top
+ case BorderMode::Top:
+ stride = box_width / pool_size;
+ x_stride = stride;
+ y_stride = 0;
+ break;
+ // left
+ case BorderMode::Left:
+ stride = box_height / pool_size;
+ x_stride = 0;
+ y_stride = stride;
+ break;
+ // bottom
+ case BorderMode::Bottom:
+ stride = box_width / pool_size;
+ x_stride = -stride;
+ y_stride = 0;
+ break;
+ // right
+ case BorderMode::Right:
+ stride = box_height / pool_size;
+ x_stride = 0;
+ y_stride = -stride;
+ break;
+ }
+
+ // initialize maxval and maxidx with the start position (e.g. (x1,y1) or
+ // (x2,y2))
+ maxval = bilinear_interpolate(offset_input, height, width, y, x, index);
+ maxidx = 0;
+
+ // do max_pool along the border
+ for (int i = 1; i <= pool_size; i++) {
+ x += x_stride;
+ y += y_stride;
+ val = bilinear_interpolate(offset_input, height, width, y, x, index);
+ if (val > maxval) {
+ maxval = val;
+ maxidx = i;
+ }
+ }
+
+ // update output and argmax_idx
+ *offset_output = maxval;
+ *offset_argmax_idx = maxidx;
+ }
+}
+
+/*** Backward ***/
+template <typename T>
+__global__ void border_align_backward_cuda_kernel(
+ const int nthreads, const T* grad_output, const T* boxes,
+ const int* argmax_idx, T* grad_input, const int channels,
+ const int box_size, const int height, const int width,
+ const int pool_size) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (batch_idx, c_idx, box_idx) is an element paralleled for computing
+ // output, and `extreme_idx` is in range [0,3]
+ int batch_idx, c_idx, box_idx, extreme_idx;
+ const int* offset_argmax_idx;
+ const T *offset_grad_output, *offset_box, *offset_box_x;
+ T *offset_grad_input, box_width, box_height, stride, x_stride, y_stride, x,
+ y;
+
+ extreme_idx = threadIdx.y;
+ batch_idx = index / channels / box_size;
+ box_idx = index % box_size + batch_idx * box_size;
+ c_idx = (index / box_size) % channels;
+
+ offset_box = boxes + box_idx * 4;
+ box_width = *(offset_box + 2) - *offset_box;
+ box_height = *(offset_box + 3) - *(offset_box + 1);
+ offset_grad_output = grad_output + index * 4 + extreme_idx;
+ offset_argmax_idx = argmax_idx + index * 4 + extreme_idx;
+ // [0,C) for top feature grad, [C,2C) for left feature grad,
+ // [2C,3C) for bottom feature grad, [3C,4C) for right feature grad
+ offset_grad_input = grad_input + (batch_idx * channels * 4 +
+ extreme_idx * channels + c_idx) *
+ height * width;
+
+ // extreme_idx in [0,1] -> offset_box_x indexed at x1
+ // extreme_idx in [2,3] -> offset_box_x indexed at x2
+ offset_box_x = offset_box + extreme_idx / 2 * 2;
+
+ switch (extreme_idx) {
+ // top
+ case BorderMode::Top:
+ stride = box_width / pool_size;
+ x_stride = stride;
+ y_stride = 0;
+ break;
+ // left
+ case BorderMode::Left:
+ stride = box_height / pool_size;
+ x_stride = 0;
+ y_stride = stride;
+ break;
+ // bottom
+ case BorderMode::Bottom:
+ stride = box_width / pool_size;
+ x_stride = -stride;
+ y_stride = 0;
+ break;
+ // right
+ case BorderMode::Right:
+ stride = box_height / pool_size;
+ x_stride = 0;
+ y_stride = -stride;
+ break;
+ }
+
+ // get position (x,y) which has maximum value during forward
+ x = *offset_box_x;
+ y = *(offset_box_x + 1);
+ x += x_stride * (T)(*offset_argmax_idx);
+ y += y_stride * (T)(*offset_argmax_idx);
+
+ T w1, w2, w3, w4;
+ int x_low, x_high, y_low, y_high;
+ bilinear_interpolate_gradient(height, width, y, x, w1, w2, w3, w4, x_low,
+ x_high, y_low, y_high, index);
+
+ // update grad_output
+ atomicAdd(offset_grad_input + y_low * width + x_low,
+ *offset_grad_output * w1);
+ atomicAdd(offset_grad_input + y_low * width + x_high,
+ *offset_grad_output * w2);
+ atomicAdd(offset_grad_input + y_high * width + x_low,
+ *offset_grad_output * w3);
+ atomicAdd(offset_grad_input + y_high * width + x_high,
+ *offset_grad_output * w4);
+ }
+}
+
+#endif // BORDER_ALIGN_CUDA_KERNEL_CUH
diff --git a/mmcv/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh b/mmcv/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..abd47cd85437804310886de057b5a839a49481b2
--- /dev/null
+++ b/mmcv/mmcv/ops/csrc/common/cuda/box_iou_rotated_cuda.cuh
@@ -0,0 +1,81 @@
+// Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
+// modified from
+// https://github.com/facebookresearch/detectron2/blob/master/detectron2/layers/csrc/box_iou_rotated/box_iou_rotated_cuda.cu
+#ifndef BOX_IOU_ROTATED_CUDA_CUH
+#define BOX_IOU_ROTATED_CUDA_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+#include "box_iou_rotated_utils.hpp"
+
+// 2D block with 32 * 16 = 512 threads per block
+const int BLOCK_DIM_X = 32;
+const int BLOCK_DIM_Y = 16;
+
+inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }
+
+template <typename T>
+__global__ void box_iou_rotated_cuda_kernel(
+ const int n_boxes1, const int n_boxes2, const T* dev_boxes1,
+ const T* dev_boxes2, T* dev_ious, const int mode_flag, const bool aligned) {
+ if (aligned) {
+ CUDA_1D_KERNEL_LOOP(index, n_boxes1) {
+ int b1 = index;
+ int b2 = index;
+
+ int base1 = b1 * 5;
+
+ float block_boxes1[5];
+ float block_boxes2[5];
+
+ block_boxes1[0] = dev_boxes1[base1 + 0];
+ block_boxes1[1] = dev_boxes1[base1 + 1];
+ block_boxes1[2] = dev_boxes1[base1 + 2];
+ block_boxes1[3] = dev_boxes1[base1 + 3];
+ block_boxes1[4] = dev_boxes1[base1 + 4];
+
+ int base2 = b2 * 5;
+
+ block_boxes2[0] = dev_boxes2[base2 + 0];
+ block_boxes2[1] = dev_boxes2[base2 + 1];
+ block_boxes2[2] = dev_boxes2[base2 + 2];
+ block_boxes2[3] = dev_boxes2[base2 + 3];
+ block_boxes2[4] = dev_boxes2[base2 + 4];
+
+ dev_ious[index] =
+          single_box_iou_rotated<T>(block_boxes1, block_boxes2, mode_flag);
+ }
+ } else {
+ CUDA_1D_KERNEL_LOOP(index, n_boxes1 * n_boxes2) {
+ int b1 = index / n_boxes2;
+ int b2 = index % n_boxes2;
+
+ int base1 = b1 * 5;
+
+ float block_boxes1[5];
+ float block_boxes2[5];
+
+ block_boxes1[0] = dev_boxes1[base1 + 0];
+ block_boxes1[1] = dev_boxes1[base1 + 1];
+ block_boxes1[2] = dev_boxes1[base1 + 2];
+ block_boxes1[3] = dev_boxes1[base1 + 3];
+ block_boxes1[4] = dev_boxes1[base1 + 4];
+
+ int base2 = b2 * 5;
+
+ block_boxes2[0] = dev_boxes2[base2 + 0];
+ block_boxes2[1] = dev_boxes2[base2 + 1];
+ block_boxes2[2] = dev_boxes2[base2 + 2];
+ block_boxes2[3] = dev_boxes2[base2 + 3];
+ block_boxes2[4] = dev_boxes2[base2 + 4];
+
+ dev_ious[index] =
+          single_box_iou_rotated<T>(block_boxes1, block_boxes2, mode_flag);
+ }
+ }
+}
+
+#endif
diff --git a/mmcv/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh b/mmcv/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..e7fa990fea1849f626baa0b81a726564373216a8
--- /dev/null
+++ b/mmcv/mmcv/ops/csrc/common/cuda/carafe_cuda_kernel.cuh
@@ -0,0 +1,332 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef CARAFE_CUDA_KERNEL_CUH
+#define CARAFE_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#ifdef HIP_DIFF
+#define WARP_SIZE 64
+#else
+#define WARP_SIZE 32
+#endif
+#define THREADS_PER_PIXEL 32
+#define MAX_SHARED_MEMORY 49152
+#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144
+#define MAXIMIZE_KERNEL_SIZE true
+#define kTileDim 32
+#define kBlockRows 8
+#define FULL_MASK 0xffffffff
+
+inline int divideUP(const int x, const int y) { return (((x) + (y)-1) / (y)); }
+
+__device__ inline int Loc2Index(const int n, const int c, const int h,
+ const int w, const int channel_num,
+ const int height, const int width) {
+ int index = w + (h + (c + n * channel_num) * height) * width;
+ return index;
+}
+#ifndef HIP_DIFF
+/* TODO: move this to a common place */
+template <typename scalar_t>
+__device__ inline scalar_t min(scalar_t a, scalar_t b) {
+ return a < b ? a : b;
+}
+
+template <typename scalar_t>
+__device__ inline scalar_t max(scalar_t a, scalar_t b) {
+ return a > b ? a : b;
+}
+#endif
+template <typename scalar_t>
+__device__ __forceinline__ scalar_t warpReduceSum(scalar_t val) {
+ for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
+#ifdef HIP_DIFF
+ val += __shfl_down(val, offset);
+#else
+ val += __shfl_down_sync(FULL_MASK, val, offset);
+#endif
+ return val;
+}
+
+template <>
+__device__ __forceinline__ phalf warpReduceSum(phalf val) {
+ for (int offset = WARP_SIZE / 2; offset > 0; offset /= 2)
+#ifdef HIP_DIFF
+ __PHALF(val) += __shfl_down(FULL_MASK, val, offset);
+#else
+ __PHALF(val) +=
+ __shfl_down_sync(FULL_MASK, static_cast<__half>(__PHALF(val)), offset);
+#endif
+ return val;
+}
+
+// Splits the original matrix into submatrices with size 32 * 32.
+// Each block transposes one submatrix by loading it into shared memory.
+// Reference https://devblogs.nvidia.com/efficient-matrix-transpose-cuda-cc/
+template <typename scalar_t>
+__global__ void BatchTranspose2DCUDAKernel(const int N, const int H,
+ const int W, const int dh,
+ const int dw,
+ const scalar_t *__restrict__ X,
+ scalar_t *__restrict__ Y) {
+ __shared__ scalar_t tile[kTileDim][kTileDim + 1];
+ const int n = blockIdx.x / (dh * dw);
+ const int k = blockIdx.x % (dh * dw);
+ const int r = k / dw;
+ const int c = k % dw;
+ const int offset = n * H * W;
+ int x = c * kTileDim + threadIdx.x;
+ int y = r * kTileDim + threadIdx.y;
+ if (x < W) {
+ for (int i = 0; threadIdx.y + i < kTileDim && y + i < H; i += kBlockRows) {
+ tile[threadIdx.y + i][threadIdx.x] = X[offset + (y + i) * W + x];
+ }
+ }
+ __syncthreads();
+ x = r * kTileDim + threadIdx.x;
+ y = c * kTileDim + threadIdx.y;
+ if (x < H) {
+ for (int i = 0; threadIdx.y + i < kTileDim && y + i < W; i += kBlockRows) {
+ Y[offset + (y + i) * H + x] = tile[threadIdx.x][threadIdx.y + i];
+ }
+ }
+}
+template <typename scalar_t>
+__global__ void CARAFEForward(
+ const int num_kernels, const scalar_t *__restrict__ bottom_data,
+ const scalar_t *__restrict__ bottom_masks, const int kernel_size,
+ const int group_size, const int scale_factor, const int channels,
+ const int down_height, const int down_width, const int height,
+ const int width, const int mask_channels, scalar_t *__restrict__ top_data) {
+#if MAXIMIZE_KERNEL_SIZE
+ __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];
+#else
+ __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];
+#endif
+
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index > num_kernels - 1) {
+ return;
+ }
+ const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;
+ const int split_id = threadIdx.x % THREADS_PER_PIXEL;
+ index = index / THREADS_PER_PIXEL;
+ const int pw = index % width;
+ const int ph = (index / width) % height;
+ const int n = index / width / height;
+
+ const int down_pw = pw / scale_factor;
+ const int down_ph = ph / scale_factor;
+
+ const int start_w = down_pw - (kernel_size - 1) / 2;
+ const int end_w = down_pw + (kernel_size - 1) / 2 + 1;
+ const int start_h = down_ph - (kernel_size - 1) / 2;
+ const int end_h = down_ph + (kernel_size - 1) / 2 + 1;
+ for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {
+ int mask_index = Loc2Index(n, ph, pw, c, height, width, mask_channels);
+ shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];
+ }
+ __syncthreads();
+
+ const int channels_per_group = ceilf(channels / (float)group_size);
+#pragma unroll
+ for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
+ int mask_group = c / channels_per_group;
+ scalar_t output_val = 0;
+#pragma unroll
+ for (int iy = start_h; iy < end_h; iy++) {
+#pragma unroll
+ for (int ix = start_w; ix < end_w; ix++) {
+ if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
+ continue;
+ }
+ int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
+ int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
+ int mask_c =
+ (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
+ int feat_index =
+ Loc2Index(n, iy, ix, c, down_height, down_width, channels);
+
+ output_val += bottom_data[feat_index] *
+ shared_mask[mask_c * WARP_SIZE + pixel_id];
+ }
+ }
+
+ int top_index = Loc2Index(n, ph, pw, c, height, width, channels);
+ top_data[top_index] = output_val;
+ }
+}
+
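+// Backward w.r.t. the features: every high-resolution location gathers the
+// contributions of all output pixels whose reassembly window covers it, using
+// the mirrored mask channel (2 * mask_group + 1) * k * k - c - 1. The host-side
+// backward then folds this result onto the low-resolution grid via FeatureSum.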
+template <typename scalar_t>
+__global__ void CARAFEBackward_Feature(
+ const int num_kernels, const scalar_t *__restrict__ top_diff,
+ const scalar_t *__restrict__ bottom_masks, const int kernel_size,
+ const int group_size, const int scale_factor, const int channels,
+ const int down_height, const int down_width, const int height,
+ const int width, const int mask_channels,
+ scalar_t *__restrict__ bottom_diff) {
+#if MAXIMIZE_KERNEL_SIZE
+ __shared__ float shared_mask[MAX_SHARED_SCALAR_T * 2];
+#else
+ __shared__ scalar_t shared_mask[MAX_SHARED_SCALAR_T];
+#endif
+
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index > num_kernels - 1) {
+ return;
+ }
+
+ const int pixel_id = threadIdx.x / THREADS_PER_PIXEL;
+ const int split_id = threadIdx.x % THREADS_PER_PIXEL;
+ // (n, c, ph, pw) is an element in the bottom_data
+ index = index / THREADS_PER_PIXEL;
+ const int pw = index % width;
+ const int ph = (index / width) % height;
+ const int n = index / width / height;
+
+ const int start_w = pw - (kernel_size - 1) * scale_factor / 2;
+ const int end_w = pw + (kernel_size - 1) * scale_factor / 2 + 1;
+ const int start_h = ph - (kernel_size - 1) * scale_factor / 2;
+ const int end_h = ph + (kernel_size - 1) * scale_factor / 2 + 1;
+ for (int c = split_id; c < mask_channels; c += THREADS_PER_PIXEL) {
+ const int mask_w = (c % kernel_size) * scale_factor;
+ const int mask_h = (c / kernel_size % kernel_size) * scale_factor;
+ const int mask_x = start_w + mask_w;
+ const int mask_y = start_h + mask_h;
+ if (mask_y < 0 || mask_y > height - 1 || mask_x < 0 || mask_x > width - 1) {
+ shared_mask[c * WARP_SIZE + pixel_id] = 0;
+ continue;
+ }
+ const int mask_group = c / (kernel_size * kernel_size);
+ const int mask_c = (2 * mask_group + 1) * kernel_size * kernel_size - c - 1;
+ int mask_index =
+ Loc2Index(n, mask_c, mask_y, mask_x, mask_channels, height, width);
+ shared_mask[c * WARP_SIZE + pixel_id] = bottom_masks[mask_index];
+ }
+ __syncthreads();
+ const int channels_per_group = ceilf(channels / (float)group_size);
+#pragma unroll
+ for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
+ int mask_group = c / channels_per_group;
+ int top_index = Loc2Index(n, ph, pw, c, height, width, channels);
+ scalar_t output_val = 0;
+#pragma unroll
+ for (int iy = start_h; iy < end_h; iy += scale_factor) {
+#pragma unroll
+ for (int ix = start_w; ix < end_w; ix += scale_factor) {
+ if (iy < 0 || iy > height - 1 || ix < 0 || ix > width - 1) {
+ continue;
+ }
+ int mask_iy =
+ (iy - ph + (kernel_size - 1) * scale_factor / 2) / scale_factor;
+ int mask_ix =
+ (ix - pw + (kernel_size - 1) * scale_factor / 2) / scale_factor;
+ int mask_c =
+ (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
+ int feat_index = Loc2Index(n, iy, ix, c, height, width, channels);
+ output_val +=
+ shared_mask[mask_c * WARP_SIZE + pixel_id] * top_diff[feat_index];
+ }
+ }
+ bottom_diff[top_index] = output_val;
+ }
+}
+
+template <typename scalar_t>
+__global__ void FeatureSum(const int num_kernels,
+ const scalar_t *__restrict__ input_data,
+ const int scale_factor, const int channels,
+ const int height, const int width,
+ scalar_t *__restrict__ output_data) {
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index > num_kernels - 1) {
+ return;
+ }
+ const int split_id = threadIdx.x % THREADS_PER_PIXEL;
+ index = index / THREADS_PER_PIXEL;
+ const int pw = index % width;
+ const int ph = (index / width) % height;
+ const int n = index / width / height;
+ for (int c = split_id; c < channels; c += THREADS_PER_PIXEL) {
+ scalar_t output_val = 0;
+ for (int iy = ph * scale_factor; iy < (ph + 1) * scale_factor; iy++) {
+ for (int ix = pw * scale_factor; ix < (pw + 1) * scale_factor; ix++) {
+ int input_id = Loc2Index(n, iy, ix, c, height * scale_factor,
+ width * scale_factor, channels);
+ output_val += input_data[input_id];
+ }
+ }
+ const int output_id = Loc2Index(n, ph, pw, c, height, width, channels);
+ output_data[output_id] = output_val;
+ }
+}
+
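+// Backward w.r.t. the masks: one warp per (pixel, mask channel). Each lane
+// accumulates top_diff * bottom_data over its share of the channel group and a
+// warp reduction lets lane 0 write the final mask gradient.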
+template <typename scalar_t>
+__global__ void CARAFEBackward_Mask(const int num_kernels,
+ const scalar_t *__restrict__ top_diff,
+ const scalar_t *__restrict__ bottom_data,
+ const int kernel_size, const int group_size,
+ const int scale_factor, const int channels,
+ const int down_height, const int down_width,
+ const int height, const int width,
+ const int mask_channels,
+ scalar_t *__restrict__ mask_diff) {
+ int index = threadIdx.x + blockIdx.x * blockDim.x;
+ if (index > num_kernels - 1) {
+ return;
+ }
+
+ const int lane_id = index % WARP_SIZE;
+ index = index / WARP_SIZE;
+ const int mask_c = index % mask_channels;
+ // (n, c, ph, pw) is an element in the bottom_data
+ index = index / mask_channels;
+ const int pw = index % width;
+ const int ph = (index / width) % height;
+ const int n = index / width / height;
+
+ const int down_pw = pw / scale_factor;
+ const int down_ph = ph / scale_factor;
+
+ const int mask_group = mask_c / (kernel_size * kernel_size);
+ const int mask_loc = mask_c % (kernel_size * kernel_size);
+
+ const int offset_x = mask_loc % kernel_size - (kernel_size - 1) / 2;
+ const int offset_y =
+ mask_loc / kernel_size % kernel_size - (kernel_size - 1) / 2;
+
+ const int down_x = down_pw + offset_x;
+ const int down_y = down_ph + offset_y;
+
+ scalar_t output_val = 0;
+
+ if (down_y >= 0 && down_y <= down_height - 1 && down_x >= 0 &&
+ down_x <= down_width - 1) {
+ const int channels_per_mask = ceilf(channels / (float)group_size);
+ const int start = channels_per_mask * mask_group;
+ const int end = min(channels_per_mask * (mask_group + 1), channels);
+ for (int c = start + lane_id; c < end; c += WARP_SIZE) {
+ int bottom_id =
+ Loc2Index(n, down_y, down_x, c, down_height, down_width, channels);
+ int top_id = Loc2Index(n, ph, pw, c, height, width, channels);
+ output_val += top_diff[top_id] * bottom_data[bottom_id];
+ }
+ }
+#ifdef HIP_DIFF
+ __syncthreads();
+#else
+ __syncwarp();
+#endif
+ output_val = warpReduceSum(output_val);
+ if (lane_id == 0) {
+ const int mask_id =
+ Loc2Index(n, ph, pw, mask_c, height, width, mask_channels);
+ mask_diff[mask_id] = output_val;
+ }
+}
+
+#endif // CARAFE_CUDA_KERNEL_CUH
diff --git a/mmcv/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh b/mmcv/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..48230c632f223b736aa72a9d5fd682c97b3aa93a
--- /dev/null
+++ b/mmcv/mmcv/ops/csrc/common/cuda/carafe_naive_cuda_kernel.cuh
@@ -0,0 +1,111 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef CARAFE_NAIVE_CUDA_KERNEL_CUH
+#define CARAFE_NAIVE_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+__device__ inline int Loc2Index(const int n, const int c, const int h,
+ const int w, const int channel_num,
+ const int height, const int width) {
+ int index = w + (h + (c + n * channel_num) * height) * width;
+ return index;
+}
+
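+// Naive CARAFE forward: one thread per output element (n, c, ph, pw), directly
+// accumulating mask-weighted features from the k x k neighborhood of the
+// corresponding low-resolution pixel, without shared-memory staging.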
+template <typename scalar_t>
+__global__ void carafe_naive_forward_cuda_kernel(
+ const int nthreads, const scalar_t *bottom_data,
+ const scalar_t *bottom_masks, scalar_t *top_data, const int kernel_size,
+ const int group_size, const int scale_factor, const int channels,
+ const int height, const int width) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the bottom_data
+ int pw = index % width;
+ int ph = (index / width) % height;
+ int c = (index / width / height) % channels;
+ int n = index / width / height / channels;
+
+ int mask_channels = kernel_size * kernel_size * group_size;
+ int mask_group = c / (channels / group_size);
+
+ int down_pw = pw / scale_factor;
+ int down_ph = ph / scale_factor;
+ int down_width = width / scale_factor;
+ int down_height = height / scale_factor;
+ int start_w = down_pw - (kernel_size - 1) / 2;
+ int end_w = down_pw + (kernel_size - 1) / 2 + 1;
+ int start_h = down_ph - (kernel_size - 1) / 2;
+ int end_h = down_ph + (kernel_size - 1) / 2 + 1;
+
+ scalar_t output_val = 0;
+ for (int iy = start_h; iy < end_h; iy++) {
+ for (int ix = start_w; ix < end_w; ix++) {
+ if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
+ continue;
+ }
+ int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
+ int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
+ int mask_c =
+ (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
+ int feat_index =
+ Loc2Index(n, c, iy, ix, channels, down_height, down_width);
+ int mask_index =
+ Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
+ output_val += bottom_data[feat_index] * bottom_masks[mask_index];
+ }
+ }
+ top_data[index] = output_val;
+ }
+}
+
+template <typename scalar_t>
+__global__ void carafe_naive_backward_cuda_kernel(
+ const int nthreads, const scalar_t *top_diff, const scalar_t *bottom_data,
+ const scalar_t *bottom_masks, scalar_t *bottom_diff, scalar_t *mask_diff,
+ const int kernel_size, const int group_size, const int scale_factor,
+ const int channels, const int height, const int width) {
+ CUDA_1D_KERNEL_LOOP(index, nthreads) {
+ // (n, c, ph, pw) is an element in the bottom_data
+ int pw = index % width;
+ int ph = (index / width) % height;
+ int c = (index / width / height) % channels;
+ int n = index / width / height / channels;
+
+ int mask_channels = kernel_size * kernel_size * group_size;
+ int mask_group = c / (channels / group_size);
+
+ int down_pw = pw / scale_factor;
+ int down_ph = ph / scale_factor;
+ int down_width = width / scale_factor;
+ int down_height = height / scale_factor;
+ int start_w = down_pw - (kernel_size - 1) / 2;
+ int end_w = down_pw + (kernel_size - 1) / 2 + 1;
+ int start_h = down_ph - (kernel_size - 1) / 2;
+ int end_h = down_ph + (kernel_size - 1) / 2 + 1;
+
+ for (int iy = start_h; iy < end_h; iy++) {
+ for (int ix = start_w; ix < end_w; ix++) {
+ if (iy < 0 || iy > down_height - 1 || ix < 0 || ix > down_width - 1) {
+ continue;
+ }
+ int mask_iy = iy - down_ph + (kernel_size - 1) / 2;
+ int mask_ix = ix - down_pw + (kernel_size - 1) / 2;
+ int mask_c =
+ (mask_group * kernel_size + mask_iy) * kernel_size + mask_ix;
+ int feat_index =
+ Loc2Index(n, c, iy, ix, channels, down_height, down_width);
+ int mask_index =
+ Loc2Index(n, mask_c, ph, pw, mask_channels, height, width);
+ atomicAdd(bottom_diff + feat_index,
+ bottom_masks[mask_index] * top_diff[index]);
+ atomicAdd(mask_diff + mask_index,
+ bottom_data[feat_index] * top_diff[index]);
+ }
+ }
+ }
+}
+
+#endif // CARAFE_NAIVE_CUDA_KERNEL_CUH
diff --git a/mmcv/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh b/mmcv/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..89feea4a546a5093967f26393ca6be3b9fe6ae05
--- /dev/null
+++ b/mmcv/mmcv/ops/csrc/common/cuda/chamfer_distance_cuda_kernel.cuh
@@ -0,0 +1,101 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+// Modified from
+// https://github.com/chrdiller/pyTorchChamferDistance/blob/master/chamfer_distance/chamfer_distance.cu
+#ifndef CHAMFER_DISTANCE_CUDA_KERNEL_CUH
+#define CHAMFER_DISTANCE_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#define MAX_SHARED_SCALAR_T 6144 // 49152 / 8 = 6144
+
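+// Chamfer distance forward (2-D point sets): xyz2 is tiled into shared memory
+// in chunks of THREADS_PER_BLOCK points; for every point of xyz the kernel
+// keeps the squared distance to its nearest neighbour and that neighbour's index.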
+template <typename scalar_t>
+__global__ void chamfer_distance_forward_cuda_kernel(int b, int n,
+ const scalar_t* xyz, int m,
+ const scalar_t* xyz2,
+ scalar_t* result,
+ int* result_i) {
+ __shared__ scalar_t buf[MAX_SHARED_SCALAR_T];
+ for (int i = blockIdx.x; i < b; i += gridDim.x) {
+ for (int k2 = 0; k2 < m; k2 += THREADS_PER_BLOCK) {
+ int end_k = min(m, k2 + THREADS_PER_BLOCK) - k2;
+ for (int j = threadIdx.x; j < end_k * 2; j += blockDim.x) {
+ buf[j] = xyz2[(i * m + k2) * 2 + j];
+ }
+ __syncthreads();
+ for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {
+ scalar_t x1 = xyz[(i * n + j) * 2 + 0];
+ scalar_t y1 = xyz[(i * n + j) * 2 + 1];
+ int best_i = 0;
+ scalar_t best = 1e10;
+ int end_ka = end_k & (~2);
+ if (end_ka == THREADS_PER_BLOCK) {
+ for (int k = 0; k < THREADS_PER_BLOCK; k += 4) {
+#pragma unroll
+ for (int j = 0; j < 4; ++j) {
+ scalar_t x2 = buf[(k + j) * 2] - x1;
+ scalar_t y2 = buf[(k + j) * 2 + 1] - y1;
+ scalar_t d = x2 * x2 + y2 * y2;
+ if (d < best) {
+ best = d;
+ best_i = k + k2 + j;
+ }
+ }
+ }
+ } else {
+ for (int k = 0; k < end_ka; k += 4) {
+#pragma unroll
+ for (int j = 0; j < 4; ++j) {
+ scalar_t x2 = buf[(k + j) * 2] - x1;
+ scalar_t y2 = buf[(k + j) * 2 + 1] - y1;
+ scalar_t d = x2 * x2 + y2 * y2;
+ if (d < best) {
+ best = d;
+ best_i = k + k2 + j;
+ }
+ }
+ }
+ }
+ for (int k = end_ka; k < end_k; k++) {
+ scalar_t x2 = buf[k * 2 + 0] - x1;
+ scalar_t y2 = buf[k * 2 + 1] - y1;
+ scalar_t d = x2 * x2 + y2 * y2;
+ if (k == 0 || d < best) {
+ best = d;
+ best_i = k + k2;
+ }
+ }
+ if (k2 == 0 || result[(i * n + j)] > best) {
+ result[(i * n + j)] = best;
+ result_i[(i * n + j)] = best_i;
+ }
+ }
+ __syncthreads();
+ }
+ }
+}
+
+template <typename scalar_t>
+__global__ void chamfer_distance_backward_cuda_kernel(
+ int b, int n, const scalar_t* xyz1, int m, const scalar_t* xyz2,
+ const scalar_t* grad_dist1, const int* idx1, scalar_t* grad_xyz1,
+ scalar_t* grad_xyz2) {
+ for (int i = blockIdx.x; i < b; i += gridDim.x) {
+ for (int j = threadIdx.x; j < n; j += blockDim.x * gridDim.y) {
+ scalar_t x1 = xyz1[(i * n + j) * 2 + 0];
+ scalar_t y1 = xyz1[(i * n + j) * 2 + 1];
+ int j2 = idx1[i * n + j];
+ scalar_t x2 = xyz2[(i * m + j2) * 2 + 0];
+ scalar_t y2 = xyz2[(i * m + j2) * 2 + 1];
+ scalar_t g = grad_dist1[i * n + j] * 2;
+ atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 0]), g * (x1 - x2));
+ atomicAdd(&(grad_xyz1[(i * n + j) * 2 + 1]), g * (y1 - y2));
+ atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 0]), -(g * (x1 - x2)));
+ atomicAdd(&(grad_xyz2[(i * m + j2) * 2 + 1]), -(g * (y1 - y2)));
+ }
+ }
+}
+#endif // CHAMFER_DISTANCE_CUDA_KERNEL_CUH
diff --git a/mmcv/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp b/mmcv/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..b12aa9a26a2cc162fd89f68ccc97e17749090a41
--- /dev/null
+++ b/mmcv/mmcv/ops/csrc/common/cuda/common_cuda_helper.hpp
@@ -0,0 +1,120 @@
+#ifndef COMMON_CUDA_HELPER
+#define COMMON_CUDA_HELPER
+
+#include <cuda.h>
+
+#define CUDA_1D_KERNEL_LOOP(i, n) \
+ for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+ i += blockDim.x * gridDim.x)
+
+#define CUDA_2D_KERNEL_LOOP(i, n, j, m) \
+ for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+ i += blockDim.x * gridDim.x) \
+ for (size_t j = blockIdx.y * blockDim.y + threadIdx.y; j < (m); \
+ j += blockDim.y * gridDim.y)
+
+#define CUDA_2D_KERNEL_BLOCK_LOOP(i, n, j, m) \
+ for (size_t i = blockIdx.x; i < (n); i += gridDim.x) \
+ for (size_t j = blockIdx.y; j < (m); j += gridDim.y)
+
+#define THREADS_PER_BLOCK 512
+
+inline int GET_BLOCKS(const int N, const int num_threads = THREADS_PER_BLOCK) {
+ int optimal_block_num = (N + num_threads - 1) / num_threads;
+ int max_block_num = 4096;
+ return min(optimal_block_num, max_block_num);
+}
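+// Typical launch pattern (sketch, names depend on the caller):
+//   kernel<<<GET_BLOCKS(N), THREADS_PER_BLOCK, 0, stream>>>(N, ...);
+// with CUDA_1D_KERNEL_LOOP(i, N) inside the kernel to grid-stride over N items.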
+
+template <typename T>
+__device__ T bilinear_interpolate(const T* input, const int height,
+ const int width, T y, T x,
+ const int index /* index for debug only*/) {
+ // deal with cases that inverse elements are out of feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) return 0;
+
+ if (y <= 0) y = 0;
+ if (x <= 0) x = 0;
+
+ int y_low = (int)y;
+ int x_low = (int)x;
+ int y_high;
+ int x_high;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+ // do bilinear interpolation
+ T v1 = input[y_low * width + x_low];
+ T v2 = input[y_low * width + x_high];
+ T v3 = input[y_high * width + x_low];
+ T v4 = input[y_high * width + x_high];
+ T w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ return val;
+}
+
+template <typename T>
+__device__ void bilinear_interpolate_gradient(
+ const int height, const int width, T y, T x, T& w1, T& w2, T& w3, T& w4,
+ int& x_low, int& x_high, int& y_low, int& y_high,
+ const int index /* index for debug only*/) {
+ // deal with cases that inverse elements are out of feature map boundary
+ if (y < -1.0 || y > height || x < -1.0 || x > width) {
+ // empty
+ w1 = w2 = w3 = w4 = 0.;
+ x_low = x_high = y_low = y_high = -1;
+ return;
+ }
+
+ if (y <= 0) y = 0;
+ if (x <= 0) x = 0;
+
+ y_low = (int)y;
+ x_low = (int)x;
+
+ if (y_low >= height - 1) {
+ y_high = y_low = height - 1;
+ y = (T)y_low;
+ } else {
+ y_high = y_low + 1;
+ }
+
+ if (x_low >= width - 1) {
+ x_high = x_low = width - 1;
+ x = (T)x_low;
+ } else {
+ x_high = x_low + 1;
+ }
+
+ T ly = y - y_low;
+ T lx = x - x_low;
+ T hy = 1. - ly, hx = 1. - lx;
+
+ // reference in forward
+ // T v1 = input[y_low * width + x_low];
+ // T v2 = input[y_low * width + x_high];
+ // T v3 = input[y_high * width + x_low];
+ // T v4 = input[y_high * width + x_high];
+ // T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+
+ w1 = hy * hx, w2 = hy * lx, w3 = ly * hx, w4 = ly * lx;
+
+ return;
+}
+#endif // COMMON_CUDA_HELPER
diff --git a/mmcv/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh b/mmcv/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2af96f7963ec347486ced942a5ef7cc4f187db8b
--- /dev/null
+++ b/mmcv/mmcv/ops/csrc/common/cuda/convex_iou_cuda_kernel.cuh
@@ -0,0 +1,831 @@
+// Copyright (c) OpenMMLab. All rights reserved
+#ifndef CONVEX_IOU_CUDA_KERNEL_CUH
+#define CONVEX_IOU_CUDA_KERNEL_CUH
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#define MAXN 100
+#define NMAX 512
+__device__ const double EPS = 1E-8;
+
+__device__ inline int sig(double d) { return (d > EPS) - (d < -EPS); }
+
+struct Point {
+ double x, y;
+ __device__ Point() {}
+ __device__ Point(double x, double y) : x(x), y(y) {}
+};
+
+__device__ inline bool point_same(Point& a, Point& b) {
+ return sig(a.x - b.x) == 0 && sig(a.y - b.y) == 0;
+}
+
+__device__ inline void swap1(Point* a, Point* b) {
+ Point temp;
+ temp.x = a->x;
+ temp.y = a->y;
+
+ a->x = b->x;
+ a->y = b->y;
+
+ b->x = temp.x;
+ b->y = temp.y;
+}
+
+__device__ inline void reverse1(Point* a, const int n) {
+ for (int i = 0; i < (n - 1) / 2.0; i++) {
+ Point* j = &(a[i]);
+ Point* k = &(a[n - 1 - i]);
+ swap1(j, k);
+ }
+}
+
+__device__ inline double cross(Point o, Point a, Point b) {
+ return (a.x - o.x) * (b.y - o.y) - (b.x - o.x) * (a.y - o.y);
+}
+
+__device__ inline double dis(Point a, Point b) {
+ return (a.x - b.x) * (a.x - b.x) + (a.y - b.y) * (a.y - b.y);
+}
+__device__ inline double area(Point* ps, int n) {
+ ps[n] = ps[0];
+ double res = 0;
+ for (int i = 0; i < n; i++) {
+ res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;
+ }
+ return res / 2.0;
+}
+__device__ inline double polygon_area_grad(Point* ps, int n,
+ int* polygon_to_pred_index,
+ int n_pred, double* grad_C) {
+ ps[n] = ps[0];
+ double partion_grad[4 * 30 + 2];
+ double res = 0;
+ for (int i = 0; i < n; i++) {
+ res += ps[i].x * ps[i + 1].y - ps[i].y * ps[i + 1].x;
+ partion_grad[i * 4 + 2] = ps[i + 1].y;
+ partion_grad[i * 4 + 3] = -ps[i + 1].x;
+ if (i != n - 1) {
+ partion_grad[i * 4 + 4] = -ps[i].y;
+ partion_grad[i * 4 + 5] = ps[i].x;
+ } else {
+ partion_grad[0] = -ps[i].y;
+ partion_grad[1] = ps[i].x;
+ }
+ }
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < n_pred; j++) {
+ if (i == polygon_to_pred_index[j]) {
+ grad_C[2 * polygon_to_pred_index[j + n_pred]] =
+ (partion_grad[i * 4] + partion_grad[i * 4 + 2]) / 2;
+ break;
+ }
+ }
+ for (int j = 0; j < n_pred; j++) {
+ if (i == polygon_to_pred_index[j]) {
+ grad_C[2 * polygon_to_pred_index[j + n_pred] + 1] =
+ (partion_grad[i * 4 + 1] + partion_grad[i * 4 + 1 + 2]) / 2;
+ break;
+ }
+ }
+ }
+
+ return res / 2.0;
+}
+
+__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p,
+ double* cut_grad, int m, int n, int i) {
+ double s1, s2;
+ double s2_s1_2;
+ double ds1_dxc, ds1_dyc, ds2_dxd, ds2_dyd;
+ double dxp_dxc, dxp_dyc, dxp_dxd, dxp_dyd, dyp_dxc, dyp_dyc, dyp_dxd, dyp_dyd;
+ s1 = cross(a, b, c);
+ s2 = cross(a, b, d);
+
+ ds1_dxc = -(b.y - a.y);
+ ds1_dyc = b.x - a.x;
+ ds2_dxd = ds1_dxc;
+ ds2_dyd = ds1_dyc;
+ s2_s1_2 = (s2 - s1) * (s2 - s1);
+
+ if (sig(s1) == 0 && sig(s2) == 0) return 2;
+ if (sig(s2 - s1) == 0) return 0;
+
+ dxp_dxc =
+ ((s2 - d.x * ds1_dxc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dxc)) /
+ (s2_s1_2);
+ dxp_dyc =
+ ((0 - d.x * ds1_dyc) * (s2 - s1) - (c.x * s2 - d.x * s1) * (-ds1_dyc)) /
+ (s2_s1_2);
+ dxp_dxd =
+ ((c.x * ds2_dxd - s1) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dxd)) /
+ (s2_s1_2);
+ dxp_dyd =
+ ((c.x * ds2_dyd - 0) * (s2 - s1) - (c.x * s2 - d.x * s1) * (ds2_dyd)) /
+ (s2_s1_2);
+
+ dyp_dxc =
+ ((0 - d.y * ds1_dxc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dxc)) /
+ (s2_s1_2);
+ dyp_dyc =
+ ((s2 - d.y * ds1_dyc) * (s2 - s1) - (c.y * s2 - d.y * s1) * (-ds1_dyc)) /
+ (s2_s1_2);
+ dyp_dxd =
+ ((c.y * ds2_dxd - 0) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dxd)) /
+ (s2_s1_2);
+ dyp_dyd =
+ ((c.y * ds2_dyd - s1) * (s2 - s1) - (c.y * s2 - d.y * s1) * (ds2_dyd)) /
+ (s2_s1_2);
+
+ p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
+ p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
+ if (i == n - 1) {
+ cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc;
+ cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
+ cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc;
+ cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
+ cut_grad[4 * n * m + 0] = dxp_dxd; // + dyp_dxd;
+ cut_grad[4 * n * m + 1] = dyp_dxd;
+ cut_grad[4 * n * m + 2] = dxp_dyd; // + dyp_dyd;
+ cut_grad[4 * n * m + 3] = dyp_dyd;
+ } else {
+ cut_grad[4 * n * m + 4 * i] = dxp_dxc; // + dyp_dxc;
+ cut_grad[4 * n * m + 4 * i + 1] = dyp_dxc;
+ cut_grad[4 * n * m + 4 * i + 2] = dxp_dyc; // + dyp_dyc;
+ cut_grad[4 * n * m + 4 * i + 3] = dyp_dyc;
+ cut_grad[4 * n * m + 4 * (i + 1)] = dxp_dxd; // + dyp_dxd;
+ cut_grad[4 * n * m + 4 * (i + 1) + 1] = dyp_dxd;
+ cut_grad[4 * n * m + 4 * (i + 1) + 2] = dxp_dyd; // + dyp_dyd;
+ cut_grad[4 * n * m + 4 * (i + 1) + 3] = dyp_dyd;
+ }
+
+ return 1;
+}
+__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b,
+ double* cut_grad) {
+ Point pp[MAXN];
+ double ccur_grad[MAXN] = {};
+ int m = 0;
+ p[n] = p[0];
+ int k = n;
+ for (int i = 0; i < n; i++) {
+ if (sig(cross(a, b, p[i])) > 0) {
+ pp[m] = p[i];
+ ccur_grad[4 * n * m + 4 * i] = 1.0;
+ ccur_grad[4 * n * m + 4 * i + 3] = 1.0;
+ m++;
+ }
+ if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
+ lineCross(a, b, p[i], p[i + 1], pp[m], ccur_grad, m, n, i);
+ m++;
+ }
+ }
+
+ n = 0;
+ for (int i = 0; i < m; i++) {
+ if (!i || !(point_same(pp[i], pp[i - 1]))) {
+ p[n] = pp[i];
+ for (int j = 0; j < 4 * k; j++) {
+ cut_grad[4 * k * n + j] = ccur_grad[4 * k * i + j];
+ }
+ n++;
+ }
+ }
+
+ while (n > 1 && point_same(p[n - 1], p[0])) n--;
+}
+
+__device__ inline double intersectArea(Point a, Point b, Point c, Point d,
+ double* grad_AB, int order,
+ int convex_n) {
+ Point o(0, 0);
+ int res_flag = 0;
+ int s1 = sig(cross(o, a, b));
+ int s2 = sig(cross(o, c, d));
+ if (s1 == 0 || s2 == 0) return 0.0;
+ if (s1 == -1) {
+ Point* i = &a;
+ Point* j = &b;
+ swap1(i, j);
+ res_flag = 1;
+ }
+ if (s2 == -1) {
+ Point* i = &c;
+ Point* j = &d;
+ swap1(i, j);
+ }
+ Point p[10] = {o, a, b};
+ int n = 3, n0 = 3, n1, n2, n3;
+ double cut_grad1[MAXN] = {};
+ double cut_grad2[MAXN] = {};
+ double cut_grad3[MAXN] = {};
+ double p1_p_grad[10][10] = {};
+ double p2_p1_grad[10][10] = {};
+ double p3_p2_grad[10][10] = {};
+
+ double p3_p1_grad[10][10] = {};
+ double p3_p_grad[10][10] = {};
+
+ // 1
+ polygon_cut(p, n, o, c, cut_grad1);
+ n1 = n;
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < 4 * n0; j++) {
+ if (!(j % 2)) {
+ p1_p_grad[2 * i][j / 2] = cut_grad1[4 * n0 * i + j];
+ } else {
+ p1_p_grad[2 * i + 1][j / 2] = cut_grad1[4 * n0 * i + j];
+ }
+ }
+ }
+
+ // 2
+ polygon_cut(p, n, c, d, cut_grad2);
+ n2 = n;
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < 4 * n1; j++) {
+ if (!(j % 2)) {
+ p2_p1_grad[2 * i][j / 2] = cut_grad2[4 * n1 * i + j];
+ } else {
+ p2_p1_grad[2 * i + 1][j / 2] = cut_grad2[4 * n1 * i + j];
+ }
+ }
+ }
+ // 3
+ polygon_cut(p, n, d, o, cut_grad3);
+ n3 = n;
+ for (int i = 0; i < n; i++) {
+ for (int j = 0; j < 4 * n2; j++) {
+ if (!(j % 2)) {
+ p3_p2_grad[2 * i][j / 2] = cut_grad3[4 * n2 * i + j];
+ } else {
+ p3_p2_grad[2 * i + 1][j / 2] = cut_grad3[4 * n2 * i + j];
+ }
+ }
+ }
+
+ // mul
+ // p3_p2(n3 * n2) * p2_p1(n2 * n1) = p3_p1 (n3 * n1)
+ for (int i = 0; i < 2 * n3; i++) {
+ for (int j = 0; j < 2 * n1; j++) {
+ double sum = 0.0;
+ for (int m = 0; m < 2 * n2; m++) {
+ sum = sum + p3_p2_grad[i][m] * p2_p1_grad[m][j];
+ }
+ p3_p1_grad[i][j] = sum;
+ }
+ }
+
+ // p3_p1 (n3 * n1) * p1_p (n1 * n0) = p3_p (n3 * n0)
+ for (int i = 0; i < 2 * n3; i++) {
+ for (int j = 0; j < 2 * n0; j++) {
+ double sum = 0.0;
+ for (int m = 0; m < 2 * n1; m++) {
+ sum = sum + p3_p1_grad[i][m] * p1_p_grad[m][j];
+ }
+ p3_p_grad[i][j] = sum;
+ }
+ }
+
+ // calculate S_grad
+ int polygon_index_box_index[20];
+ double grad_polygon[20];
+ double S_grad[6];
+
+ for (int i = 0; i < n3; i++) {
+ polygon_index_box_index[i] = i;
+ polygon_index_box_index[i + n3] = i;
+ }
+
+ double res =
+ polygon_area_grad(p, n3, polygon_index_box_index, n3, grad_polygon);
+
+ if (s1 * s2 == -1) {
+ for (int j = 0; j < 2 * 3; j++) {
+ double sum = 0.0;
+ for (int m = 0; m < 2 * n3; m++) {
+ sum = sum - grad_polygon[m] * p3_p_grad[m][j];
+ }
+ S_grad[j] = sum;
+ }
+
+ if (order != convex_n - 1) {
+ if (res_flag) {
+ grad_AB[2 * order] += S_grad[4];
+ grad_AB[2 * order + 1] += S_grad[5];
+ grad_AB[2 * order + 2] += S_grad[2];
+ grad_AB[2 * order + 3] += S_grad[3];
+
+ } else {
+ grad_AB[2 * order] += S_grad[2];
+ grad_AB[2 * order + 1] += S_grad[3];
+ grad_AB[2 * order + 2] += S_grad[4];
+ grad_AB[2 * order + 3] += S_grad[5];
+ }
+ } else {
+ if (res_flag) {
+ grad_AB[2 * order] += S_grad[4];
+ grad_AB[2 * order + 1] += S_grad[5];
+ grad_AB[0] += S_grad[2];
+ grad_AB[1] += S_grad[3];
+
+ } else {
+ grad_AB[2 * order] += S_grad[2];
+ grad_AB[2 * order + 1] += S_grad[3];
+ grad_AB[0] += S_grad[4];
+ grad_AB[1] += S_grad[5];
+ }
+ }
+ res = -res;
+ } else {
+ for (int j = 0; j < 2 * 3; j++) {
+ double sum = 0.0;
+ for (int m = 0; m < 2 * n3; m++) {
+ sum = sum + grad_polygon[m] * p3_p_grad[m][j];
+ }
+ S_grad[j] = sum;
+ }
+
+ if (order != convex_n - 1) {
+ if (res_flag) {
+ grad_AB[2 * order] += S_grad[4];
+ grad_AB[2 * order + 1] += S_grad[5];
+ grad_AB[2 * order + 2] += S_grad[2];
+ grad_AB[2 * order + 3] += S_grad[3];
+ } else {
+ grad_AB[2 * order] += S_grad[2];
+ grad_AB[2 * order + 1] += S_grad[3];
+ grad_AB[2 * order + 2] += S_grad[4];
+ grad_AB[2 * order + 3] += S_grad[5];
+ }
+ } else {
+ if (res_flag) {
+ grad_AB[2 * order] += S_grad[4];
+ grad_AB[2 * order + 1] += S_grad[5];
+ grad_AB[0] += S_grad[2];
+ grad_AB[1] += S_grad[3];
+ } else {
+ grad_AB[2 * order] += S_grad[2];
+ grad_AB[2 * order + 1] += S_grad[3];
+ grad_AB[0] += S_grad[4];
+ grad_AB[1] += S_grad[5];
+ }
+ }
+ }
+ return res;
+}
+
+__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2, int n2,
+ double* grad_AB) {
+ if (area(ps1, n1) < 0) reverse1(ps1, n1);
+ if (area(ps2, n2) < 0) reverse1(ps2, n2);
+ ps1[n1] = ps1[0];
+ ps2[n2] = ps2[0];
+ double res = 0;
+ for (int i = 0; i < n1; i++) {
+ for (int j = 0; j < n2; j++) {
+ res +=
+ intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1], grad_AB, i, n1);
+ }
+ }
+ return res;
+}
+
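+// Gift-wrapping (Jarvis march) convex hull: after moving the bottom-most point
+// to index 0, the right and left chains up to the top-most point are traced
+// separately and concatenated back into in_poly; n_poly becomes the hull size.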
+__device__ inline void Jarvis(Point* in_poly, int& n_poly) {
+ Point p_max, p_k;
+ int max_index, k_index;
+ int Stack[NMAX] = {}, top1, top2;
+ double sign;
+ Point right_point[10], left_point[10];
+
+ for (int i = 0; i < n_poly; i++) {
+ if (in_poly[i].y < in_poly[0].y ||
+ in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
+ Point* j = &(in_poly[0]);
+ Point* k = &(in_poly[i]);
+ swap1(j, k);
+ }
+ if (i == 0) {
+ p_max = in_poly[0];
+ max_index = 0;
+ }
+ if (in_poly[i].y > p_max.y ||
+ in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
+ p_max = in_poly[i];
+ max_index = i;
+ }
+ }
+
+ if (max_index == 0) {
+ max_index = 1;
+ p_max = in_poly[max_index];
+ }
+
+ k_index = 0, Stack[0] = 0, top1 = 0;
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
+ if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
+ dis(in_poly[Stack[top1]], p_k)))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top1++;
+ Stack[top1] = k_index;
+ }
+ for (int i = 0; i <= top1; i++) right_point[i] = in_poly[Stack[i]];
+
+ k_index = 0, Stack[0] = 0, top2 = 0;
+
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
+ if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
+ dis(in_poly[Stack[top2]], p_k))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top2++;
+ Stack[top2] = k_index;
+ }
+ for (int i = top2 - 1; i >= 0; i--) left_point[i] = in_poly[Stack[i]];
+
+ for (int i = 0; i < top1 + top2; i++) {
+ if (i <= top1) {
+ in_poly[i] = right_point[i];
+ } else {
+ in_poly[i] = left_point[top2 - (i - top1)];
+ }
+ }
+ n_poly = top1 + top2;
+}
+
+__device__ inline double intersectAreaPoly(Point* ps1, int n1, Point* ps2,
+ int n2, double* grad_C) {
+ Point polygon[MAXN];
+ int n = n1 + n2, n_poly = 0;
+ for (int i = 0; i < n1; i++) {
+ for (int j = 0; j < n - n1; j++) {
+ if (point_same(ps1[i], ps2[j])) {
+ for (int k = j; k < n - n1 - 1; k++) {
+ ps2[k] = ps2[k + 1];
+ }
+ n2--;
+ break;
+ }
+ }
+ }
+ n_poly = n1 + n2;
+ for (int i = 0; i < n_poly; i++) {
+ if (i < n1) {
+ polygon[i] = ps1[i];
+ } else {
+ polygon[i] = ps2[i - n1];
+ }
+ }
+
+ Jarvis(polygon, n_poly);
+
+ int polygon_to_pred_index[18] = {-1, -1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1};
+ int n_pred = 0;
+ for (int i = 0; i < n_poly; i++) {
+ for (int j = 0; j < n1; j++) {
+ if (polygon[i].x == ps1[j].x && polygon[i].y == ps1[j].y) {
+ polygon_to_pred_index[n_pred] = i;
+ polygon_to_pred_index[n_pred + n1] = j;
+ n_pred += 1;
+ break;
+ }
+ }
+ }
+ if (n_pred == 0) {
+ double polygon_area = fabs(area(polygon, n_poly));
+ for (int i = 0; i < 18; i++) {
+ grad_C[i] = 0.0;
+ }
+ return polygon_area;
+ } else {
+ double polygon_area =
+ polygon_area_grad(polygon, n_poly, polygon_to_pred_index, n1, grad_C);
+ if (polygon_area < 0) {
+ for (int i = 0; i < 18; i++) {
+ grad_C[i] = -grad_C[i];
+ }
+ }
+ return fabs(polygon_area);
+ }
+}
+
+// convex_find and get the polygon_index_box_index
+__device__ inline void Jarvis_and_index(Point* in_poly, int& n_poly,
+ int* points_to_convex_ind) {
+ int n_input = n_poly;
+ Point input_poly[20];
+ for (int i = 0; i < n_input; i++) {
+ input_poly[i].x = in_poly[i].x;
+ input_poly[i].y = in_poly[i].y;
+ }
+ Point p_max, p_k;
+ int max_index, k_index;
+ int Stack[20], top1, top2;
+ double sign;
+ Point right_point[10], left_point[10];
+
+ for (int i = 0; i < n_poly; i++) {
+ if (in_poly[i].y < in_poly[0].y ||
+ in_poly[i].y == in_poly[0].y && in_poly[i].x < in_poly[0].x) {
+ Point* j = &(in_poly[0]);
+ Point* k = &(in_poly[i]);
+ swap1(j, k);
+ }
+ if (i == 0) {
+ p_max = in_poly[0];
+ max_index = 0;
+ }
+ if (in_poly[i].y > p_max.y ||
+ in_poly[i].y == p_max.y && in_poly[i].x > p_max.x) {
+ p_max = in_poly[i];
+ max_index = i;
+ }
+ }
+ if (max_index == 0) {
+ max_index = 1;
+ p_max = in_poly[max_index];
+ }
+
+ k_index = 0, Stack[0] = 0, top1 = 0;
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top1]], in_poly[i], p_k);
+ if ((sign > 0) || ((sign == 0) && (dis(in_poly[Stack[top1]], in_poly[i]) >
+ dis(in_poly[Stack[top1]], p_k)))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top1++;
+ Stack[top1] = k_index;
+ }
+ for (int i = 0; i <= top1; i++) {
+ right_point[i] = in_poly[Stack[i]];
+ }
+
+ k_index = 0, Stack[0] = 0, top2 = 0;
+
+ while (k_index != max_index) {
+ p_k = p_max;
+ k_index = max_index;
+ for (int i = 1; i < n_poly; i++) {
+ sign = cross(in_poly[Stack[top2]], in_poly[i], p_k);
+ if ((sign < 0) || (sign == 0) && (dis(in_poly[Stack[top2]], in_poly[i]) >
+ dis(in_poly[Stack[top2]], p_k))) {
+ p_k = in_poly[i];
+ k_index = i;
+ }
+ }
+ top2++;
+ Stack[top2] = k_index;
+ }
+
+ for (int i = top2 - 1; i >= 0; i--) {
+ left_point[i] = in_poly[Stack[i]];
+ }
+
+ for (int i = 0; i < top1 + top2; i++) {
+ if (i <= top1) {
+ in_poly[i] = right_point[i];
+ } else {
+ in_poly[i] = left_point[top2 - (i - top1)];
+ }
+ }
+ n_poly = top1 + top2;
+ for (int i = 0; i < n_poly; i++) {
+ for (int j = 0; j < n_input; j++) {
+ if (point_same(in_poly[i], input_poly[j])) {
+ points_to_convex_ind[i] = j;
+ break;
+ }
+ }
+ }
+}
+
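+// Convex GIoU between the convex hull of a predicted 9-point set and a 4-point
+// ground-truth box, plus the analytic gradient of the GIoU w.r.t. the 18
+// predicted coordinates (written to point_grad).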
+template <typename T>
+__device__ inline float devrIoU(T const* const p, T const* const q,
+ T* point_grad, const int idx) {
+ Point ps1[MAXN], ps2[MAXN];
+
+ Point convex[MAXN];
+ for (int i = 0; i < 9; i++) {
+ convex[i].x = (double)p[i * 2];
+ convex[i].y = (double)p[i * 2 + 1];
+ }
+ int n_convex = 9;
+ int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
+ Jarvis_and_index(convex, n_convex, points_to_convex_ind);
+
+ int n1 = n_convex;
+ int n2 = 4;
+
+ for (int i = 0; i < n1; i++) {
+ ps1[i].x = (double)convex[i].x;
+ ps1[i].y = (double)convex[i].y;
+ }
+
+ for (int i = 0; i < n2; i++) {
+ ps2[i].x = (double)q[i * 2];
+ ps2[i].y = (double)q[i * 2 + 1];
+ }
+
+ int polygon_index_box_index[18];
+ for (int i = 0; i < n1; i++) {
+ polygon_index_box_index[i] = i;
+ polygon_index_box_index[i + n1] = i;
+ }
+
+ double grad_A[18] = {};
+ double grad_AB[18] = {};
+ double grad_C[18] = {};
+
+ double inter_area = intersectAreaO(ps1, n1, ps2, n2, grad_AB);
+ double S_pred =
+ polygon_area_grad(ps1, n1, polygon_index_box_index, n1, grad_A);
+ if (S_pred < 0) {
+ for (int i = 0; i < n_convex * 2; i++) {
+ grad_A[i] = -grad_A[i];
+ }
+ }
+ double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;
+
+ double iou = inter_area / union_area;
+ double polygon_area = intersectAreaPoly(ps1, n1, ps2, n2, grad_C);
+
+ // printf("%d:live\n", idx);
+ double rot_giou = iou - (polygon_area - union_area) / polygon_area;
+
+ float grad_point_temp[18] = {};
+
+ for (int i = 0; i < n_convex; i++) {
+ int grad_point = points_to_convex_ind[i];
+ grad_point_temp[2 * grad_point] =
+ (float)((union_area + inter_area) / (union_area * union_area) *
+ grad_AB[2 * i] -
+ iou / union_area * grad_A[2 * i] -
+ 1 / polygon_area * (grad_AB[2 * i] - grad_A[2 * i]) -
+ (union_area) / polygon_area / polygon_area * grad_C[2 * i]);
+ grad_point_temp[2 * grad_point + 1] =
+ (float)((union_area + inter_area) / (union_area * union_area) *
+ grad_AB[2 * i + 1] -
+ iou / union_area * grad_A[2 * i + 1] -
+ 1 / polygon_area * (grad_AB[2 * i + 1] - grad_A[2 * i + 1]) -
+ (union_area) / polygon_area / polygon_area * grad_C[2 * i + 1]);
+ }
+
+ for (int i = 0; i < 9; i++) {
+ point_grad[2 * i] = grad_point_temp[2 * i];
+ point_grad[2 * i + 1] = grad_point_temp[2 * i + 1];
+ }
+ return (float)rot_giou;
+}
+
+template <typename T>
+__global__ void convex_giou_cuda_kernel(const int ex_n_boxes,
+ const int gt_n_boxes, const T* ex_boxes,
+ const T* gt_boxes, T* point_grad) {
+ CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
+ const T* cur_box = ex_boxes + index * 18;
+ const T* cur_gt_box = gt_boxes + index * 8;
+ T* cur_grad = point_grad + index * 19;
+ T giou = devrIoU(cur_box, cur_gt_box, cur_grad, threadIdx.x);
+ cur_grad[18] = giou;
+ }
+}
+
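+// The gradient-free overloads below serve convex_iou_cuda_kernel, which only
+// needs the IoU value and not its derivatives.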
+__device__ inline int lineCross(Point a, Point b, Point c, Point d, Point& p) {
+ double s1, s2;
+ s1 = cross(a, b, c);
+ s2 = cross(a, b, d);
+ if (sig(s1) == 0 && sig(s2) == 0) return 2;
+ if (sig(s2 - s1) == 0) return 0;
+ p.x = (c.x * s2 - d.x * s1) / (s2 - s1);
+ p.y = (c.y * s2 - d.y * s1) / (s2 - s1);
+ return 1;
+}
+
+__device__ inline void polygon_cut(Point* p, int& n, Point a, Point b) {
+ Point pp[MAXN];
+ int m = 0;
+ p[n] = p[0];
+ for (int i = 0; i < n; i++) {
+ if (sig(cross(a, b, p[i])) > 0) {
+ pp[m] = p[i];
+ m++;
+ }
+ if (sig(cross(a, b, p[i])) != sig(cross(a, b, p[i + 1]))) {
+ lineCross(a, b, p[i], p[i + 1], pp[m]);
+ m++;
+ }
+ }
+ n = 0;
+ for (int i = 0; i < m; i++) {
+ if (!i || !(point_same(pp[i], pp[i - 1]))) {
+ p[n] = pp[i];
+ n++;
+ }
+ }
+
+ while (n > 1 && point_same(p[n - 1], p[0])) n--;
+}
+
+__device__ inline double intersectArea(Point a, Point b, Point c, Point d) {
+ Point o(0, 0);
+ int s1 = sig(cross(o, a, b));
+ int s2 = sig(cross(o, c, d));
+ if (s1 == 0 || s2 == 0) return 0.0;
+ if (s1 == -1) {
+ Point* i = &a;
+ Point* j = &b;
+ swap1(i, j);
+ }
+ if (s2 == -1) {
+ Point* i = &c;
+ Point* j = &d;
+ swap1(i, j);
+ }
+ Point p[10] = {o, a, b};
+ int n = 3;
+
+ polygon_cut(p, n, o, c);
+ polygon_cut(p, n, c, d);
+ polygon_cut(p, n, d, o);
+ double res = area(p, n);
+ if (s1 * s2 == -1) res = -res;
+ return res;
+}
+__device__ inline double intersectAreaO(Point* ps1, int n1, Point* ps2,
+ int n2) {
+ if (area(ps1, n1) < 0) reverse1(ps1, n1);
+ if (area(ps2, n2) < 0) reverse1(ps2, n2);
+ ps1[n1] = ps1[0];
+ ps2[n2] = ps2[0];
+ double res = 0;
+ for (int i = 0; i < n1; i++) {
+ for (int j = 0; j < n2; j++) {
+ res += intersectArea(ps1[i], ps1[i + 1], ps2[j], ps2[j + 1]);
+ }
+ }
+ return res;
+}
+
+template <typename T>
+__device__ inline float devrIoU(T const* const p, T const* const q) {
+ Point ps1[MAXN], ps2[MAXN];
+ Point convex[MAXN];
+ for (int i = 0; i < 9; i++) {
+ convex[i].x = (double)p[i * 2];
+ convex[i].y = (double)p[i * 2 + 1];
+ }
+ int n_convex = 9;
+ int points_to_convex_ind[9] = {-1, -1, -1, -1, -1, -1, -1, -1, -1};
+ Jarvis_and_index(convex, n_convex, points_to_convex_ind);
+ int n1 = n_convex;
+ for (int i = 0; i < n1; i++) {
+ ps1[i].x = (double)convex[i].x;
+ ps1[i].y = (double)convex[i].y;
+ }
+ int n2 = 4;
+ for (int i = 0; i < n2; i++) {
+ ps2[i].x = (double)q[i * 2];
+ ps2[i].y = (double)q[i * 2 + 1];
+ }
+ double inter_area = intersectAreaO(ps1, n1, ps2, n2);
+ double S_pred = area(ps1, n1);
+ double union_area = fabs(S_pred) + fabs(area(ps2, n2)) - inter_area;
+ double iou = inter_area / union_area;
+ return (float)iou;
+}
+
+template <typename T>
+__global__ void convex_iou_cuda_kernel(const int ex_n_boxes,
+ const int gt_n_boxes, const T* ex_boxes,
+ const T* gt_boxes, T* iou) {
+ CUDA_1D_KERNEL_LOOP(index, ex_n_boxes) {
+ const T* cur_box = ex_boxes + index * 18;
+ for (int i = 0; i < gt_n_boxes; i++) {
+ iou[index * gt_n_boxes + i] = devrIoU(cur_box, gt_boxes + i * 8);
+ }
+ }
+}
+#endif // CONVEX_IOU_CUDA_KERNEL_CUH
diff --git a/mmcv/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh b/mmcv/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..2f7f112989127da235cb35476e15b206d4c2e3d4
--- /dev/null
+++ b/mmcv/mmcv/ops/csrc/common/cuda/correlation_cuda.cuh
@@ -0,0 +1,225 @@
+// Copyright (c) OpenMMLab. All rights reserved.
+// Modified from
+// https://github.com/ClementPinard/Pytorch-Correlation-extension/blob/master/Correlation_Module/correlation_cuda_kernel.cu
+// Original licence: Under MIT License
+
+#ifndef CORRELATION_CUDA
+#define CORRELATION_CUDA
+
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else
+#include "pytorch_cuda_helper.hpp"
+#endif
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+// Using <torch/extension.h> is recommended in the official documentation in
+// https://pytorch.org/tutorials/advanced/cpp_extension.html#writing-the-c-op.
+// However, we use <torch/types.h> for compatibility with CUDA 9.0
+// Read https://github.com/pytorch/extension-cpp/issues/35 for more details.
+#include <torch/types.h>
+
+#include <iostream>
+#include <vector>
+
+using namespace torch;
+
+#define TensorAcc4R PackedTensorAccessor32<scalar_t, 4, RestrictPtrTraits>
+#define TensorAcc5R PackedTensorAccessor32<scalar_t, 5, RestrictPtrTraits>
+#define WITHIN_BOUNDS(x, y, H, W) (x >= 0 && x < H && y >= 0 && y < W)
+
+#define WARP_SIZE 32
+#define FULL_MASK 0xffffffff
+
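+// Correlation (cost volume) forward: each output pixel (n, h, w) is handled by
+// WARP_SIZE lanes (threadIdx.x) that split the channel-wise dot product between
+// the two feature maps for every patch displacement (ph, pw); the partial sums
+// are then combined with a shuffle-down reduction before lane 0 writes out.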
+template <typename scalar_t>
+__global__ void correlation_forward_cuda_kernel(
+ const TensorAcc4R rInput1, const TensorAcc4R rInput2, TensorAcc5R output,
+ int kH, int kW, int patchH, int patchW, int padH, int padW, int dilationH,
+ int dilationW, int dilation_patchH, int dilation_patchW, int dH, int dW) {
+ const int iH = rInput1.size(1);
+ const int iW = rInput1.size(2);
+ const int C = rInput1.size(3);
+
+ const int n = blockIdx.x;
+ const int h = blockIdx.y * blockDim.y + threadIdx.y;
+ const int w = blockIdx.z * blockDim.z + threadIdx.z;
+ const int thread = threadIdx.x;
+
+ const int start_i = -padH + h * dH;
+ const int start_j = -padW + w * dW;
+
+ const int patchRadH = dilation_patchH * (patchH - 1) / 2;
+ const int patchRadW = dilation_patchW * (patchW - 1) / 2;
+
+ for (int ph = 0; ph < patchH; ++ph) {
+ int ph_dilated = ph * dilation_patchH - patchRadH;
+ for (int pw = 0; pw < patchW; ++pw) {
+ int pw_dilated = pw * dilation_patchW - patchRadW;
+ scalar_t prod_sum = 0.0f;
+ for (int i = 0; i < kH; ++i) {
+ int i1 = start_i + i * dilationH;
+ int i2 = i1 + ph_dilated;
+ if
+ WITHIN_BOUNDS(i1, i2, iH, iH) {
+ for (int j = 0; j < kW; ++j) {
+ int j1 = start_j + j * dilationW;
+ int j2 = j1 + pw_dilated;
+ if
+ WITHIN_BOUNDS(j1, j2, iW, iW) {
+ for (int c = thread; c < C; c += WARP_SIZE) {
+ scalar_t v1 = rInput1[n][i1][j1][c];
+ scalar_t v2 = rInput2[n][i2][j2][c];
+ prod_sum += v1 * v2;
+ }
+ }
+ }
+ }
+ }
+ // accumulate
+ for (int offset = 16; offset > 0; offset /= 2)
+ prod_sum += __shfl_down_sync(FULL_MASK, float(prod_sum), offset);
+ if (thread == 0) {
+ output[n][ph][pw][h][w] = prod_sum;
+ }
+ }
+ }
+}
+
+template <typename scalar_t>
+__global__ void correlation_backward_cuda_kernel_input1(
+ const TensorAcc5R grad_output, const TensorAcc4R input2,
+ TensorAcc4R grad_input1, const int kH, const int kW, const int patchH,
+ const int patchW, const int padH, const int padW, const int dilationH,
+ const int dilationW, const int dilation_patchH, const int dilation_patchW,
+ const int dH, const int dW) {
+ const int iH = input2.size(1);
+ const int iW = input2.size(2);
+ const int C = input2.size(3);
+
+ const int H = grad_output.size(3);
+ const int W = grad_output.size(4);
+
+ const int patchRadH = (patchH - 1) / 2;
+ const int patchRadW = (patchW - 1) / 2;
+
+ const int n = blockIdx.x;
+ const int h = blockIdx.y;
+ const int w = blockIdx.z;
+
+ const int h_2 = h + padH;
+ const int w_2 = w + padW;
+ const int min_h = h_2 - kH * dilationH;
+ const int min_w = w_2 - kW * dilationW;
+
+ extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
+  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
+ for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
+ const int ph = i / patchW;
+ const int pw = i % patchW;
+ int i1 = h + dilation_patchH * (ph - patchRadH);
+ int j1 = w + dilation_patchW * (pw - patchRadW);
+
+ if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+ scalar_t grad_val = 0.0f;
+ for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+ int i2 = (h_3) / dH;
+ if (i2 * dH != h_3) continue;
+ for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+ int j2 = (w_3) / dW;
+ if (j2 * dW != w_3) continue;
+ if (WITHIN_BOUNDS(i2, j2, H, W)) {
+ grad_val += grad_output[n][ph][pw][i2][j2];
+ }
+ }
+ }
+ grad_cache[i] = grad_val;
+ }
+ }
+ __syncthreads();
+
+ for (int c = threadIdx.x; c < C; c += blockDim.x) {
+ scalar_t grad_input_val = 0.0f;
+ for (int ph = 0; ph < patchH; ++ph) {
+ int i1 = h + dilation_patchH * (ph - patchRadH);
+ for (int pw = 0; pw < patchW; ++pw) {
+ int j1 = w + dilation_patchW * (pw - patchRadW);
+ if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+ grad_input_val += input2[n][i1][j1][c] * grad_cache[ph * patchW + pw];
+ }
+ }
+ }
+ grad_input1[n][c][h][w] = grad_input_val;
+ }
+}
+
+template <typename scalar_t>
+__global__ void correlation_backward_cuda_kernel_input2(
+ const TensorAcc5R grad_output, const TensorAcc4R input1,
+ TensorAcc4R grad_input2, int kH, int kW, int patchH, int patchW, int padH,
+ int padW, int dilationH, int dilationW, int dilation_patchH,
+ int dilation_patchW, int dH, int dW) {
+ const int iH = input1.size(1);
+ const int iW = input1.size(2);
+ const int C = input1.size(3);
+
+ const int patchRadH = (patchH - 1) / 2;
+ const int patchRadW = (patchW - 1) / 2;
+
+ const int H = grad_output.size(3);
+ const int W = grad_output.size(4);
+
+ const int dilatedKH = kH * dilationH;
+ const int dilatedKW = kW * dilationW;
+
+ const int n = blockIdx.x;
+ const int h = blockIdx.y;
+ const int w = blockIdx.z;
+
+ extern __shared__ __align__(sizeof(4)) unsigned char grad_cache_char[];
+  scalar_t *grad_cache = reinterpret_cast<scalar_t *>(grad_cache_char);
+ for (int i = threadIdx.x; i < patchH * patchW; i += blockDim.x) {
+ const int ph = i / patchW;
+ const int pw = i % patchW;
+ int i1 = h - dilation_patchH * (ph - patchRadH);
+ int j1 = w - dilation_patchW * (pw - patchRadW);
+
+ if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+ scalar_t grad_val = 0.0f;
+
+ const int h_2 = i1 + padH;
+ const int w_2 = j1 + padW;
+ const int min_h = h_2 - dilatedKH;
+ const int min_w = w_2 - dilatedKW;
+
+ for (int h_3 = h_2; h_3 > min_h; h_3 -= dilationH) {
+ int i2 = (h_3) / dH;
+ if (i2 * dH != h_3) continue;
+ for (int w_3 = w_2; w_3 > min_w; w_3 -= dilationW) {
+ int j2 = (w_3) / dW;
+ if (j2 * dW != w_3) continue;
+ if (WITHIN_BOUNDS(i2, j2, H, W)) {
+ grad_val += grad_output[n][ph][pw][i2][j2];
+ }
+ }
+ }
+ grad_cache[i] = grad_val;
+ }
+ }
+ __syncthreads();
+
+ for (int c = threadIdx.x; c < C; c += blockDim.x) {
+ scalar_t grad_input_val = 0.0f;
+ for (int ph = 0; ph < patchH; ++ph) {
+ int i1 = h - dilation_patchH * (ph - patchRadH);
+ for (int pw = 0; pw < patchW; ++pw) {
+ int j1 = w - dilation_patchW * (pw - patchRadW);
+ if (WITHIN_BOUNDS(i1, j1, iH, iW)) {
+ grad_input_val += input1[n][i1][j1][c] * grad_cache[ph * patchW + pw];
+ }
+ }
+ }
+ grad_input2[n][c][h][w] = grad_input_val;
+ }
+}
+#endif
diff --git a/mmcv/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh b/mmcv/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..6b4d1bbd85bad1b87ee5d6b8a3cd3b29e3cbc411
--- /dev/null
+++ b/mmcv/mmcv/ops/csrc/common/cuda/deform_conv_cuda_kernel.cuh
@@ -0,0 +1,367 @@
+/*!
+ ******************* BEGIN Caffe Copyright Notice and Disclaimer
+ *****************
+ *
+ * COPYRIGHT
+ *
+ * All contributions by the University of California:
+ * Copyright (c) 2014-2017 The Regents of the University of California (Regents)
+ * All rights reserved.
+ *
+ * All other contributions:
+ * Copyright (c) 2014-2017, the respective contributors
+ * All rights reserved.
+ *
+ * Caffe uses a shared copyright model: each contributor holds copyright over
+ * their contributions to Caffe. The project versioning records all such
+ * contribution and copyright details. If a contributor wants to further mark
+ * their specific copyright on a particular contribution, they should indicate
+ * their copyright solely in the commit message of the change when it is
+ * committed.
+ *
+ * LICENSE
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ *AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ *IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+ *FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ *DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ *SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ *CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ *OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * CONTRIBUTION AGREEMENT
+ *
+ * By contributing to the BVLC/caffe repository through pull-request, comment,
+ * or otherwise, the contributor releases their content to the
+ * license and copyright terms herein.
+ *
+ ***************** END Caffe Copyright Notice and Disclaimer
+ *********************
+ *
+ * Copyright (c) 2018 Microsoft
+ * Licensed under The MIT License [see LICENSE for details]
+ * \file modulated_deformable_im2col.cuh
+ * \brief Function definitions of converting an image to
+ * column matrix based on kernel, padding, dilation, and offset.
+ * These functions are mainly used in deformable convolution operators.
+ * \ref: https://arxiv.org/abs/1703.06211
+ * \author Yuwen Xiong, Haozhi Qi, Jifeng Dai, Xizhou Zhu, Han Hu, Dazhi Cheng
+ */
+
+// modified from
+// https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/blob/mmdetection/mmdet/ops/dcn/src/deform_conv_cuda_kernel.cu
+
+#ifndef DEFORM_CONV_CUDA_KERNEL_CUH
+#define DEFORM_CONV_CUDA_KERNEL_CUH
+
+#include <float.h>
+#ifdef MMCV_WITH_TRT
+#include "common_cuda_helper.hpp"
+#else // MMCV_WITH_TRT
+#ifdef MMCV_USE_PARROTS
+#include "parrots_cuda_helper.hpp"
+#else // MMCV_USE_PARROTS
+#include "pytorch_cuda_helper.hpp"
+#endif // MMCV_USE_PARROTS
+#endif // MMCV_WITH_TRT
+
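+// Bilinear sampling helper for deformable convolution: samples input at the
+// (possibly fractional) location (h, w), returning 0 outside the feature map.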
+template <typename T>
+__device__ T deformable_im2col_bilinear(const T *input, const int data_width,
+ const int height, const int width, T h,
+ T w) {
+ if (h <= -1 || height <= h || w <= -1 || width <= w) {
+ return 0;
+ }
+
+ int h_low = floorf(h);
+ int w_low = floorf(w);
+ int h_high = h_low + 1;
+ int w_high = w_low + 1;
+
+ T lh = h - h_low;
+ T lw = w - w_low;
+ T hh = 1 - lh, hw = 1 - lw;
+
+ T v1 = 0;
+ if (h_low >= 0 && w_low >= 0) v1 = input[h_low * data_width + w_low];
+ T v2 = 0;
+ if (h_low >= 0 && w_high <= width - 1)
+ v2 = input[h_low * data_width + w_high];
+ T v3 = 0;
+ if (h_high <= height - 1 && w_low >= 0)
+ v3 = input[h_high * data_width + w_low];
+ T v4 = 0;
+ if (h_high <= height - 1 && w_high <= width - 1)
+ v4 = input[h_high * data_width + w_high];
+
+ T w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw;
+
+ T val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4);
+ return val;
+}
+
+template <typename T>
+__device__ T get_gradient_weight(T argmax_h, T argmax_w, const int h,
+ const int w, const int height,
+ const int width) {
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+ argmax_w >= width) {
+ // empty
+ return 0;
+ }
+
+ int argmax_h_low = floorf(argmax_h);
+ int argmax_w_low = floorf(argmax_w);
+ int argmax_h_high = argmax_h_low + 1;
+ int argmax_w_high = argmax_w_low + 1;
+
+ T weight = 0;
+ if (h == argmax_h_low && w == argmax_w_low)
+ weight = (h + 1 - argmax_h) * (w + 1 - argmax_w);
+ if (h == argmax_h_low && w == argmax_w_high)
+ weight = (h + 1 - argmax_h) * (argmax_w + 1 - w);
+ if (h == argmax_h_high && w == argmax_w_low)
+ weight = (argmax_h + 1 - h) * (w + 1 - argmax_w);
+ if (h == argmax_h_high && w == argmax_w_high)
+ weight = (argmax_h + 1 - h) * (argmax_w + 1 - w);
+ return weight;
+}
+
+template <typename T>
+__device__ T get_coordinate_weight(T argmax_h, T argmax_w, const int height,
+ const int width, const T *im_data,
+ const int data_width, const int bp_dir) {
+ if (argmax_h <= -1 || argmax_h >= height || argmax_w <= -1 ||
+ argmax_w >= width) {
+ // empty
+ return 0;
+ }
+
+ int argmax_h_low = floorf(argmax_h);
+ int argmax_w_low = floorf(argmax_w);
+ int argmax_h_high = argmax_h_low + 1;
+ int argmax_w_high = argmax_w_low + 1;
+
+ T weight = 0;
+
+ if (bp_dir == 0) {
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
+ weight += -1 * (argmax_w_low + 1 - argmax_w) *
+ im_data[argmax_h_low * data_width + argmax_w_low];
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+ weight += -1 * (argmax_w - argmax_w_low) *
+ im_data[argmax_h_low * data_width + argmax_w_high];
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+ weight += (argmax_w_low + 1 - argmax_w) *
+ im_data[argmax_h_high * data_width + argmax_w_low];
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+ weight += (argmax_w - argmax_w_low) *
+ im_data[argmax_h_high * data_width + argmax_w_high];
+ } else if (bp_dir == 1) {
+ if (argmax_h_low >= 0 && argmax_w_low >= 0)
+ weight += -1 * (argmax_h_low + 1 - argmax_h) *
+ im_data[argmax_h_low * data_width + argmax_w_low];
+ if (argmax_h_low >= 0 && argmax_w_high <= width - 1)
+ weight += (argmax_h_low + 1 - argmax_h) *
+ im_data[argmax_h_low * data_width + argmax_w_high];
+ if (argmax_h_high <= height - 1 && argmax_w_low >= 0)
+ weight += -1 * (argmax_h - argmax_h_low) *
+ im_data[argmax_h_high * data_width + argmax_w_low];
+ if (argmax_h_high <= height - 1 && argmax_w_high <= width - 1)
+ weight += (argmax_h - argmax_h_low) *
+ im_data[argmax_h_high * data_width + argmax_w_high];
+ }
+
+ return weight;
+}
+
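+// Deformable im2col: for each column entry, read the learned (offset_h,
+// offset_w) pair, bilinearly sample the input at the displaced kernel location,
+// and write the value into the column buffer that the convolution weights are
+// then applied to.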
+template <typename T>
+__global__ void deformable_im2col_gpu_kernel(
+ const int n, const T *data_im, const T *data_offset, const int height,
+ const int width, const int kernel_h, const int kernel_w, const int pad_h,
+ const int pad_w, const int stride_h, const int stride_w,
+ const int dilation_h, const int dilation_w,
+ const int channel_per_deformable_group, const int batch_size,
+ const int num_channels, const int deformable_group, const int height_col,
+ const int width_col, T *data_col) {
+ CUDA_1D_KERNEL_LOOP(index, n) {
+ // index index of output matrix
+ const int w_col = index % width_col;
+ const int h_col = (index / width_col) % height_col;
+ const int b_col = (index / width_col / height_col) % batch_size;
+ const int c_im = (index / width_col / height_col) / batch_size;
+ const int c_col = c_im * kernel_h * kernel_w;
+
+ // compute deformable group index
+ const int deformable_group_index = c_im / channel_per_deformable_group;
+
+ const int h_in = h_col * stride_h - pad_h;
+ const int w_in = w_col * stride_w - pad_w;
+ T *data_col_ptr =
+ data_col +
+ ((c_col * batch_size + b_col) * height_col + h_col) * width_col + w_col;
+ const T *data_im_ptr =
+ data_im + (b_col * num_channels + c_im) * height * width;
+ const T *data_offset_ptr =
+ data_offset + (b_col * deformable_group + deformable_group_index) * 2 *
+ kernel_h * kernel_w * height_col * width_col;
+
+ for (int i = 0; i < kernel_h; ++i) {
+ for (int j = 0; j < kernel_w; ++j) {
+ const int data_offset_h_ptr =
+ ((2 * (i * kernel_w + j)) * height_col + h_col) * width_col + w_col;
+ const int data_offset_w_ptr =
+ ((2 * (i * kernel_w + j) + 1) * height_col + h_col) * width_col +
+ w_col;
+ const T offset_h = data_offset_ptr[data_offset_h_ptr];
+ const T offset_w = data_offset_ptr[data_offset_w_ptr];
+        T val = static_cast<T>(0);
+ const T h_im = h_in + i * dilation_h + offset_h;
+ const T w_im = w_in + j * dilation_w + offset_w;
+ if (h_im > -1 && w_im > -1 && h_im < height && w_im < width)
+ val = deformable_im2col_bilinear(data_im_ptr, width, height, width,
+ h_im, w_im);
+ *data_col_ptr = val;
+ data_col_ptr += batch_size * height_col * width_col;
+ }
+ }
+ }
+}
+
+template <typename T>